/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */
34 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
35 #include <emmintrin.h> /* for SSE2 intrinsics */
36 #include "pixman-private.h"
37 #include "pixman-combine32.h"
38 #include "pixman-fast-path.h"
40 #if defined(_MSC_VER) && defined(_M_AMD64)
/* Windows 64 doesn't allow MMX to be used, so
 * the pixman-x64-mmx-emulation.h file contains
 * implementations of those MMX intrinsics that
 * are used in the SSE2 implementation.
 */
# include "pixman-x64-mmx-emulation.h"
#endif
/* --------------------------------------------------------------------
 */
55 static __m128i mask_0080;
56 static __m128i mask_00ff;
57 static __m128i mask_0101;
58 static __m128i mask_ffff;
59 static __m128i mask_ff000000;
60 static __m128i mask_alpha;
62 static __m128i mask_565_r;
63 static __m128i mask_565_g1, mask_565_g2;
64 static __m128i mask_565_b;
65 static __m128i mask_red;
66 static __m128i mask_green;
67 static __m128i mask_blue;
69 static __m128i mask_565_fix_rb;
70 static __m128i mask_565_fix_g;
/* ----------------------------------------------------------------------
 */
75 static force_inline __m128i
76 unpack_32_1x128 (uint32_t data)
78 return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
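/* Note: throughout this file, "unpacked" pixels hold each 8-bit channel
 * in a 16-bit lane; the headroom is what lets the saturating adds and the
 * mullo/mulhi divide-by-255 sequence below work without overflow.  Worked
 * example: unpack_32_1x128 (0xff00cc33) yields the four low 16-bit lanes
 * 0x0033, 0x00cc, 0x0000, 0x00ff (blue, green, red, alpha for a8r8g8b8).
 */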
81 static force_inline void
82 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
84 *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
85 *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
88 static force_inline __m128i
89 unpack_565_to_8888 (__m128i lo)
91 __m128i r, g, b, rb, t;
93 r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
94 g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
95 b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
97 rb = _mm_or_si128 (r, b);
98 t = _mm_and_si128 (rb, mask_565_fix_rb);
99 t = _mm_srli_epi32 (t, 5);
100 rb = _mm_or_si128 (rb, t);
102 t = _mm_and_si128 (g, mask_565_fix_g);
103 t = _mm_srli_epi32 (t, 6);
104 g = _mm_or_si128 (g, t);
106 return _mm_or_si128 (rb, g);
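/* The OR-in-shifted-copy steps above replicate the top bits of each
 * channel into the low bits left empty by the widening shift, so the
 * 5- and 6-bit ranges expand to the full 0x00-0xff range.  Worked example
 * for red: 5-bit 0x1f widens to 0xf8, and 0xf8 | (0xf8 >> 5) = 0xff,
 * while 0x00 stays 0x00.
 */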
109 static force_inline void
unpack_565_128_4x128 (__m128i  data,
                      __m128i* data0,
                      __m128i* data1,
                      __m128i* data2,
                      __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack_565_to_8888 (lo);
    hi = unpack_565_to_8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}
128 static force_inline uint16_t
129 pack_565_32_16 (uint32_t pixel)
131 return (uint16_t) (((pixel >> 8) & 0xf800) |
132 ((pixel >> 5) & 0x07e0) |
133 ((pixel >> 3) & 0x001f));
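/* Worked example: pixel 0x00ff8040 (r = 0xff, g = 0x80, b = 0x40) keeps
 * the top 5/6/5 bits of each channel: (0x00ff8040 >> 8) & 0xf800 = 0xf800,
 * (0x00ff8040 >> 5) & 0x07e0 = 0x0400, (0x00ff8040 >> 3) & 0x001f = 0x0008,
 * giving 0xfc08.
 */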
136 static force_inline __m128i
137 pack_2x128_128 (__m128i lo, __m128i hi)
139 return _mm_packus_epi16 (lo, hi);
142 static force_inline __m128i
143 pack_565_2x128_128 (__m128i lo, __m128i hi)
146 __m128i r, g1, g2, b;
148 data = pack_2x128_128 (lo, hi);
150 r = _mm_and_si128 (data, mask_565_r);
151 g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
152 g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
153 b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
155 return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
158 static force_inline __m128i
159 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
161 return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
162 pack_565_2x128_128 (*xmm2, *xmm3));
165 static force_inline int
166 is_opaque (__m128i x)
168 __m128i ffs = _mm_cmpeq_epi8 (x, x);
170 return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
static force_inline int
is_zero (__m128i x)
{
    return _mm_movemask_epi8 (
	_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
}
180 static force_inline int
181 is_transparent (__m128i x)
183 return (_mm_movemask_epi8 (
184 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
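/* In these predicates the 0x8888 mask selects the movemask bit of byte 3
 * of every 32-bit pixel, i.e. the alpha byte of an a8r8g8b8 pixel, so
 * is_opaque and is_transparent test only the four alpha channels, while
 * is_zero tests all 16 bytes.
 */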
187 static force_inline __m128i
188 expand_pixel_32_1x128 (uint32_t data)
190 return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
193 static force_inline __m128i
194 expand_alpha_1x128 (__m128i data)
196 return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
197 _MM_SHUFFLE (3, 3, 3, 3)),
198 _MM_SHUFFLE (3, 3, 3, 3));
201 static force_inline void
expand_alpha_2x128 (__m128i  data_lo,
                    __m128i  data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));

    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
}
216 static force_inline void
expand_alpha_rev_2x128 (__m128i  data_lo,
                        __m128i  data_hi,
                        __m128i* alpha_lo,
                        __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
}
230 static force_inline void
pix_multiply_2x128 (__m128i* data_lo,
                    __m128i* data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi,
                    __m128i* ret_lo,
                    __m128i* ret_hi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
    lo = _mm_adds_epu16 (lo, mask_0080);
    hi = _mm_adds_epu16 (hi, mask_0080);
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
}
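/* The adds/mulhi pair above is the usual exact divide-by-255 trick:
 * with t = x*a + 0x80, (t * 0x0101) >> 16 == (t + (t >> 8)) >> 8,
 * which equals x*a/255 correctly rounded.  Worked example:
 * x = a = 0xff gives t = 0xfe81, and (0xfe81 * 0x0101) >> 16 = 0xff.
 */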
248 static force_inline void
pix_add_multiply_2x128 (__m128i* src_lo,
                        __m128i* src_hi,
                        __m128i* alpha_dst_lo,
                        __m128i* alpha_dst_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi,
                        __m128i* alpha_src_lo,
                        __m128i* alpha_src_hi,
                        __m128i* ret_lo,
                        __m128i* ret_hi)
{
    __m128i t1_lo, t1_hi;
    __m128i t2_lo, t2_hi;

    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);

    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
}
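/* Computes, per channel and with unsigned saturation,
 *   ret = src*alpha_dst/255 + dst*alpha_src/255
 * which is the building block the ATOP and XOR combiners below share.
 */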
270 static force_inline void
negate_2x128 (__m128i  data_lo,
              __m128i  data_hi,
              __m128i* neg_lo,
              __m128i* neg_hi)
{
    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
}
280 static force_inline void
invert_colors_2x128 (__m128i  data_lo,
                     __m128i  data_hi,
                     __m128i* inv_lo,
                     __m128i* inv_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
}
294 static force_inline void
over_2x128 (__m128i* src_lo,
            __m128i* src_hi,
            __m128i* alpha_lo,
            __m128i* alpha_hi,
            __m128i* dst_lo,
            __m128i* dst_hi)
{
    __m128i t1, t2;

    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);

    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);

    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
}
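/* Porter-Duff OVER on unpacked, premultiplied data:
 *   dst = src + (255 - alpha) * dst / 255
 * where alpha is normally the expanded source alpha.  For example, a
 * half-opaque white source (src = alpha = 0x80) over dst = 0xff gives
 * 0x80 + 0x7f*0xff/255 = 0xff.
 */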
312 static force_inline void
over_rev_non_pre_2x128 (__m128i  src_lo,
                        __m128i  src_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi)
{
    __m128i lo, hi;
    __m128i alpha_lo, alpha_hi;

    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);

    lo = _mm_or_si128 (alpha_lo, mask_alpha);
    hi = _mm_or_si128 (alpha_hi, mask_alpha);

    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);

    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
}
333 static force_inline void
in_over_2x128 (__m128i* src_lo,
               __m128i* src_hi,
               __m128i* alpha_lo,
               __m128i* alpha_hi,
               __m128i* mask_lo,
               __m128i* mask_hi,
               __m128i* dst_lo,
               __m128i* dst_hi)
{
    __m128i s_lo, s_hi;
    __m128i a_lo, a_hi;

    pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);

    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}
/* load 4 pixels from a 16-byte-aligned address */
353 static force_inline __m128i
354 load_128_aligned (__m128i* src)
356 return _mm_load_si128 (src);
/* load 4 pixels from an unaligned address */
360 static force_inline __m128i
361 load_128_unaligned (const __m128i* src)
363 return _mm_loadu_si128 (src);
/* save 4 pixels using Write Combining memory on a 16-byte-aligned
 * address
 */
369 static force_inline void
save_128_write_combining (__m128i* dst,
                          __m128i  data)
{
    _mm_stream_si128 (dst, data);
}
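/* _mm_stream_si128 is a non-temporal store: it writes straight to memory
 * without pulling the destination line into the cache, which helps for
 * large fills that the CPU is not going to read back soon.
 */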
/* save 4 pixels on a 16-byte-aligned address */
377 static force_inline void
save_128_aligned (__m128i* dst,
                  __m128i  data)
{
    _mm_store_si128 (dst, data);
}
/* save 4 pixels on an unaligned address */
385 static force_inline void
save_128_unaligned (__m128i* dst,
                    __m128i  data)
{
    _mm_storeu_si128 (dst, data);
}
/* ------------------------------------------------------------------
 */
396 static force_inline __m128i
397 load_32_1x128 (uint32_t data)
399 return _mm_cvtsi32_si128 (data);
402 static force_inline __m128i
403 expand_alpha_rev_1x128 (__m128i data)
405 return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
408 static force_inline __m128i
409 expand_pixel_8_1x128 (uint8_t data)
411 return _mm_shufflelo_epi16 (
412 unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
415 static force_inline __m128i
pix_multiply_1x128 (__m128i data,
                    __m128i alpha)
{
    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
                                            mask_0080),
                            mask_0101);
}
424 static force_inline __m128i
pix_add_multiply_1x128 (__m128i* src,
                        __m128i* alpha_dst,
                        __m128i* dst,
                        __m128i* alpha_src)
{
    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);

    return _mm_adds_epu8 (t1, t2);
}
436 static force_inline __m128i
437 negate_1x128 (__m128i data)
439 return _mm_xor_si128 (data, mask_00ff);
442 static force_inline __m128i
443 invert_colors_1x128 (__m128i data)
445 return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
448 static force_inline __m128i
449 over_1x128 (__m128i src, __m128i alpha, __m128i dst)
451 return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
454 static force_inline __m128i
in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
{
    return over_1x128 (pix_multiply_1x128 (*src, *mask),
                       pix_multiply_1x128 (*alpha, *mask),
                       *dst);
}
462 static force_inline __m128i
over_rev_non_pre_1x128 (__m128i src, __m128i dst)
{
    __m128i alpha = expand_alpha_1x128 (src);

    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
                                           _mm_or_si128 (alpha, mask_alpha)),
                       alpha,
                       dst);
}
473 static force_inline uint32_t
474 pack_1x128_32 (__m128i data)
476 return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
479 static force_inline __m128i
480 expand565_16_1x128 (uint16_t pixel)
482 __m128i m = _mm_cvtsi32_si128 (pixel);
484 m = unpack_565_to_8888 (m);
486 return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
/* ----------------------------------------------------------------------------
 * Compose Core transformations
 */
492 static force_inline uint32_t
493 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
506 xmms = unpack_32_1x128 (src);
507 return pack_1x128_32 (
508 over_1x128 (xmms, expand_alpha_1x128 (xmms),
509 unpack_32_1x128 (dst)));
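/* This scalar unpack -> operate -> pack pattern is what all the loops
 * below fall back to for the unaligned head and tail pixels around their
 * 16-byte-aligned 4-pixel bodies.
 */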
515 static force_inline uint32_t
516 combine1 (const uint32_t *ps, const uint32_t *pm)
524 mm = unpack_32_1x128 (*pm);
525 mm = expand_alpha_1x128 (mm);
527 ms = unpack_32_1x128 (s);
528 ms = pix_multiply_1x128 (ms, mm);
530 s = pack_1x128_32 (ms);
536 static force_inline __m128i
537 combine4 (const __m128i *ps, const __m128i *pm)
539 __m128i xmm_src_lo, xmm_src_hi;
540 __m128i xmm_msk_lo, xmm_msk_hi;
545 xmm_msk_lo = load_128_unaligned (pm);
547 if (is_transparent (xmm_msk_lo))
548 return _mm_setzero_si128 ();
551 s = load_128_unaligned (ps);
555 unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
556 unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
558 expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
560 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
561 &xmm_msk_lo, &xmm_msk_hi,
562 &xmm_src_lo, &xmm_src_hi);
564 s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
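/* combine1/combine4 implement the "u" (unified-alpha) source fetch: when
 * a mask pointer pm is given, each source pixel is first multiplied by
 * the mask's expanded alpha, so e.g. a mask alpha of 0x80 roughly halves
 * every source channel before the operator proper is applied; a fully
 * transparent mask short-circuits to zero.
 */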
570 static force_inline void
571 core_combine_over_u_sse2_mask (uint32_t * pd,
578 /* Align dst on a 16-byte boundary */
579 while (w && ((unsigned long)pd & 15))
582 s = combine1 (ps, pm);
585 *pd = core_combine_over_u_pixel_sse2 (s, d);
594 __m128i mask = load_128_unaligned ((__m128i *)pm);
599 __m128i src_hi, src_lo;
600 __m128i mask_hi, mask_lo;
601 __m128i alpha_hi, alpha_lo;
603 src = load_128_unaligned ((__m128i *)ps);
605 if (is_opaque (_mm_and_si128 (src, mask)))
607 save_128_aligned ((__m128i *)pd, src);
611 __m128i dst = load_128_aligned ((__m128i *)pd);
612 __m128i dst_hi, dst_lo;
614 unpack_128_2x128 (mask, &mask_lo, &mask_hi);
615 unpack_128_2x128 (src, &src_lo, &src_hi);
617 expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
618 pix_multiply_2x128 (&src_lo, &src_hi,
622 unpack_128_2x128 (dst, &dst_lo, &dst_hi);
624 expand_alpha_2x128 (src_lo, src_hi,
625 &alpha_lo, &alpha_hi);
627 over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
632 pack_2x128_128 (dst_lo, dst_hi));
644 s = combine1 (ps, pm);
647 *pd = core_combine_over_u_pixel_sse2 (s, d);
656 static force_inline void
657 core_combine_over_u_sse2_no_mask (uint32_t * pd,
663 /* Align dst on a 16-byte boundary */
664 while (w && ((unsigned long)pd & 15))
670 *pd = core_combine_over_u_pixel_sse2 (s, d);
679 __m128i src_hi, src_lo, dst_hi, dst_lo;
680 __m128i alpha_hi, alpha_lo;
682 src = load_128_unaligned ((__m128i *)ps);
688 save_128_aligned ((__m128i *)pd, src);
692 __m128i dst = load_128_aligned ((__m128i *)pd);
694 unpack_128_2x128 (src, &src_lo, &src_hi);
695 unpack_128_2x128 (dst, &dst_lo, &dst_hi);
697 expand_alpha_2x128 (src_lo, src_hi,
698 &alpha_lo, &alpha_hi);
699 over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
704 pack_2x128_128 (dst_lo, dst_hi));
718 *pd = core_combine_over_u_pixel_sse2 (s, d);
726 static force_inline void
727 sse2_combine_over_u (pixman_implementation_t *imp,
735 core_combine_over_u_sse2_mask (pd, ps, pm, w);
737 core_combine_over_u_sse2_no_mask (pd, ps, w);
741 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
750 __m128i xmm_dst_lo, xmm_dst_hi;
751 __m128i xmm_src_lo, xmm_src_hi;
752 __m128i xmm_alpha_lo, xmm_alpha_hi;
754 /* Align dst on a 16-byte boundary */
756 ((unsigned long)pd & 15))
759 s = combine1 (ps, pm);
761 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
/* I'm loading unaligned because I'm not sure
 * about the address alignment.
 */
773 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
774 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
776 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
777 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
779 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
780 &xmm_alpha_lo, &xmm_alpha_hi);
782 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
783 &xmm_alpha_lo, &xmm_alpha_hi,
784 &xmm_src_lo, &xmm_src_hi);
/* rebuild the 4 pixel data and save */
787 save_128_aligned ((__m128i*)pd,
788 pack_2x128_128 (xmm_src_lo, xmm_src_hi));
801 s = combine1 (ps, pm);
803 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
811 static force_inline uint32_t
812 core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
814 uint32_t maska = src >> 24;
820 else if (maska != 0xff)
822 return pack_1x128_32 (
823 pix_multiply_1x128 (unpack_32_1x128 (dst),
824 expand_alpha_1x128 (unpack_32_1x128 (src))));
831 sse2_combine_in_u (pixman_implementation_t *imp,
840 __m128i xmm_src_lo, xmm_src_hi;
841 __m128i xmm_dst_lo, xmm_dst_hi;
843 while (w && ((unsigned long) pd & 15))
845 s = combine1 (ps, pm);
848 *pd++ = core_combine_in_u_pixel_sse2 (d, s);
857 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
858 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
860 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
861 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
863 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
864 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
865 &xmm_dst_lo, &xmm_dst_hi,
866 &xmm_dst_lo, &xmm_dst_hi);
868 save_128_aligned ((__m128i*)pd,
869 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
880 s = combine1 (ps, pm);
883 *pd++ = core_combine_in_u_pixel_sse2 (d, s);
892 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
901 __m128i xmm_src_lo, xmm_src_hi;
902 __m128i xmm_dst_lo, xmm_dst_hi;
904 while (w && ((unsigned long) pd & 15))
906 s = combine1 (ps, pm);
909 *pd++ = core_combine_in_u_pixel_sse2 (s, d);
918 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
919 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
921 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
922 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
924 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
925 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
926 &xmm_src_lo, &xmm_src_hi,
927 &xmm_dst_lo, &xmm_dst_hi);
930 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
941 s = combine1 (ps, pm);
944 *pd++ = core_combine_in_u_pixel_sse2 (s, d);
953 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
960 while (w && ((unsigned long) pd & 15))
962 uint32_t s = combine1 (ps, pm);
965 *pd++ = pack_1x128_32 (
967 unpack_32_1x128 (d), negate_1x128 (
968 expand_alpha_1x128 (unpack_32_1x128 (s)))));
978 __m128i xmm_src_lo, xmm_src_hi;
979 __m128i xmm_dst_lo, xmm_dst_hi;
981 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
982 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
984 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
985 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
987 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
988 negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
990 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
991 &xmm_src_lo, &xmm_src_hi,
992 &xmm_dst_lo, &xmm_dst_hi);
995 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1007 uint32_t s = combine1 (ps, pm);
1010 *pd++ = pack_1x128_32 (
1011 pix_multiply_1x128 (
1012 unpack_32_1x128 (d), negate_1x128 (
1013 expand_alpha_1x128 (unpack_32_1x128 (s)))));
1022 sse2_combine_out_u (pixman_implementation_t *imp,
1025 const uint32_t * ps,
1026 const uint32_t * pm,
1029 while (w && ((unsigned long) pd & 15))
1031 uint32_t s = combine1 (ps, pm);
1034 *pd++ = pack_1x128_32 (
1035 pix_multiply_1x128 (
1036 unpack_32_1x128 (s), negate_1x128 (
1037 expand_alpha_1x128 (unpack_32_1x128 (d)))));
1046 __m128i xmm_src_lo, xmm_src_hi;
1047 __m128i xmm_dst_lo, xmm_dst_hi;
1049 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1050 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1052 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1053 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1055 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1056 negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1058 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1059 &xmm_dst_lo, &xmm_dst_hi,
1060 &xmm_dst_lo, &xmm_dst_hi);
1063 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1074 uint32_t s = combine1 (ps, pm);
1077 *pd++ = pack_1x128_32 (
1078 pix_multiply_1x128 (
1079 unpack_32_1x128 (s), negate_1x128 (
1080 expand_alpha_1x128 (unpack_32_1x128 (d)))));
1088 static force_inline uint32_t
1089 core_combine_atop_u_pixel_sse2 (uint32_t src,
1092 __m128i s = unpack_32_1x128 (src);
1093 __m128i d = unpack_32_1x128 (dst);
1095 __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
1096 __m128i da = expand_alpha_1x128 (d);
1098 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1102 sse2_combine_atop_u (pixman_implementation_t *imp,
1105 const uint32_t * ps,
1106 const uint32_t * pm,
1111 __m128i xmm_src_lo, xmm_src_hi;
1112 __m128i xmm_dst_lo, xmm_dst_hi;
1113 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1114 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1116 while (w && ((unsigned long) pd & 15))
1118 s = combine1 (ps, pm);
1121 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1130 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1131 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1133 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1134 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1136 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1137 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1138 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1139 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1141 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1142 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1144 pix_add_multiply_2x128 (
1145 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1146 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1147 &xmm_dst_lo, &xmm_dst_hi);
1150 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1161 s = combine1 (ps, pm);
1164 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1172 static force_inline uint32_t
1173 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1176 __m128i s = unpack_32_1x128 (src);
1177 __m128i d = unpack_32_1x128 (dst);
1179 __m128i sa = expand_alpha_1x128 (s);
1180 __m128i da = negate_1x128 (expand_alpha_1x128 (d));
1182 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1186 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
1189 const uint32_t * ps,
1190 const uint32_t * pm,
1195 __m128i xmm_src_lo, xmm_src_hi;
1196 __m128i xmm_dst_lo, xmm_dst_hi;
1197 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1198 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1200 while (w && ((unsigned long) pd & 15))
1202 s = combine1 (ps, pm);
1205 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1214 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1215 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1217 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1218 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1220 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1221 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1222 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1223 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1225 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1226 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1228 pix_add_multiply_2x128 (
1229 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1230 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1231 &xmm_dst_lo, &xmm_dst_hi);
1234 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1245 s = combine1 (ps, pm);
1248 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1256 static force_inline uint32_t
1257 core_combine_xor_u_pixel_sse2 (uint32_t src,
1260 __m128i s = unpack_32_1x128 (src);
1261 __m128i d = unpack_32_1x128 (dst);
1263 __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
1264 __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
1266 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
1270 sse2_combine_xor_u (pixman_implementation_t *imp,
1273 const uint32_t * src,
1274 const uint32_t * mask,
1280 const uint32_t* ps = src;
1281 const uint32_t* pm = mask;
1283 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1284 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1285 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1286 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1288 while (w && ((unsigned long) pd & 15))
1290 s = combine1 (ps, pm);
1293 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1302 xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1303 xmm_dst = load_128_aligned ((__m128i*) pd);
1305 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1306 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1308 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1309 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1310 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1311 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1313 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1314 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1315 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1316 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1318 pix_add_multiply_2x128 (
1319 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1320 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1321 &xmm_dst_lo, &xmm_dst_hi);
1324 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1335 s = combine1 (ps, pm);
1338 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1346 static force_inline void
1347 sse2_combine_add_u (pixman_implementation_t *imp,
1350 const uint32_t * src,
1351 const uint32_t * mask,
1357 const uint32_t* ps = src;
1358 const uint32_t* pm = mask;
1360 while (w && (unsigned long)pd & 15)
1362 s = combine1 (ps, pm);
1368 *pd++ = _mm_cvtsi128_si32 (
1369 _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1377 s = combine4 ((__m128i*)ps, (__m128i*)pm);
1380 (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));
1391 s = combine1 (ps, pm);
1395 *pd++ = _mm_cvtsi128_si32 (
1396 _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1402 static force_inline uint32_t
1403 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1406 __m128i ms = unpack_32_1x128 (src);
1407 __m128i md = unpack_32_1x128 (dst);
1408 uint32_t sa = src >> 24;
1409 uint32_t da = ~dst >> 24;
1413 ms = pix_multiply_1x128 (
1414 ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
1417 return pack_1x128_32 (_mm_adds_epu16 (md, ms));
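/* SATURATE adds as much of the source as will fit without overflowing
 * the destination alpha: when the source alpha sa exceeds the free space
 * da = 255 - dst_alpha, the source is first scaled by
 * DIV_UN8 (da, sa) = da*255/sa.  E.g. sa = 0xff with dst alpha 0xc0
 * leaves da = 0x3f, so the source is scaled to 0x3f/0xff of itself
 * before the saturating add.
 */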
1421 sse2_combine_saturate_u (pixman_implementation_t *imp,
1424 const uint32_t * ps,
1425 const uint32_t * pm,
1431 __m128i xmm_src, xmm_dst;
1433 while (w && (unsigned long)pd & 15)
1435 s = combine1 (ps, pm);
1438 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1447 xmm_dst = load_128_aligned ((__m128i*)pd);
1448 xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1450 pack_cmp = _mm_movemask_epi8 (
1452 _mm_srli_epi32 (xmm_src, 24),
1453 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
/* if some alpha src is greater than the respective ~alpha dst */
1458 s = combine1 (ps++, pm);
1460 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1464 s = combine1 (ps++, pm);
1466 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1470 s = combine1 (ps++, pm);
1472 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1476 s = combine1 (ps++, pm);
1478 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1484 save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1497 s = combine1 (ps, pm);
1500 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1508 sse2_combine_src_ca (pixman_implementation_t *imp,
1511 const uint32_t * ps,
1512 const uint32_t * pm,
1517 __m128i xmm_src_lo, xmm_src_hi;
1518 __m128i xmm_mask_lo, xmm_mask_hi;
1519 __m128i xmm_dst_lo, xmm_dst_hi;
1521 while (w && (unsigned long)pd & 15)
1525 *pd++ = pack_1x128_32 (
1526 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1532 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1533 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1535 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1536 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1538 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1539 &xmm_mask_lo, &xmm_mask_hi,
1540 &xmm_dst_lo, &xmm_dst_hi);
1543 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1555 *pd++ = pack_1x128_32 (
1556 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1561 static force_inline uint32_t
1562 core_combine_over_ca_pixel_sse2 (uint32_t src,
1566 __m128i s = unpack_32_1x128 (src);
1567 __m128i expAlpha = expand_alpha_1x128 (s);
1568 __m128i unpk_mask = unpack_32_1x128 (mask);
1569 __m128i unpk_dst = unpack_32_1x128 (dst);
1571 return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
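/* Component-alpha OVER: in_over = (src IN mask) OVER dst, with the
 * source alpha also multiplied by the per-channel mask, as used for
 * subpixel text rendering.
 */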
1575 sse2_combine_over_ca (pixman_implementation_t *imp,
1578 const uint32_t * ps,
1579 const uint32_t * pm,
1584 __m128i xmm_alpha_lo, xmm_alpha_hi;
1585 __m128i xmm_src_lo, xmm_src_hi;
1586 __m128i xmm_dst_lo, xmm_dst_hi;
1587 __m128i xmm_mask_lo, xmm_mask_hi;
1589 while (w && (unsigned long)pd & 15)
1595 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1601 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1602 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1603 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1605 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1606 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1607 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1609 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1610 &xmm_alpha_lo, &xmm_alpha_hi);
1612 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1613 &xmm_alpha_lo, &xmm_alpha_hi,
1614 &xmm_mask_lo, &xmm_mask_hi,
1615 &xmm_dst_lo, &xmm_dst_hi);
1618 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1632 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1637 static force_inline uint32_t
1638 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1642 __m128i d = unpack_32_1x128 (dst);
1644 return pack_1x128_32 (
1645 over_1x128 (d, expand_alpha_1x128 (d),
1646 pix_multiply_1x128 (unpack_32_1x128 (src),
1647 unpack_32_1x128 (mask))));
1651 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
1654 const uint32_t * ps,
1655 const uint32_t * pm,
1660 __m128i xmm_alpha_lo, xmm_alpha_hi;
1661 __m128i xmm_src_lo, xmm_src_hi;
1662 __m128i xmm_dst_lo, xmm_dst_hi;
1663 __m128i xmm_mask_lo, xmm_mask_hi;
1665 while (w && (unsigned long)pd & 15)
1671 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1677 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1678 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1679 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1681 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1682 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1683 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1685 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1686 &xmm_alpha_lo, &xmm_alpha_hi);
1687 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1688 &xmm_mask_lo, &xmm_mask_hi,
1689 &xmm_mask_lo, &xmm_mask_hi);
1691 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1692 &xmm_alpha_lo, &xmm_alpha_hi,
1693 &xmm_mask_lo, &xmm_mask_hi);
1696 (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1710 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1716 sse2_combine_in_ca (pixman_implementation_t *imp,
1719 const uint32_t * ps,
1720 const uint32_t * pm,
1725 __m128i xmm_alpha_lo, xmm_alpha_hi;
1726 __m128i xmm_src_lo, xmm_src_hi;
1727 __m128i xmm_dst_lo, xmm_dst_hi;
1728 __m128i xmm_mask_lo, xmm_mask_hi;
1730 while (w && (unsigned long)pd & 15)
1736 *pd++ = pack_1x128_32 (
1737 pix_multiply_1x128 (
1738 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
1739 expand_alpha_1x128 (unpack_32_1x128 (d))));
1746 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1747 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1748 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1750 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1751 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1752 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1754 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1755 &xmm_alpha_lo, &xmm_alpha_hi);
1757 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1758 &xmm_mask_lo, &xmm_mask_hi,
1759 &xmm_dst_lo, &xmm_dst_hi);
1761 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1762 &xmm_alpha_lo, &xmm_alpha_hi,
1763 &xmm_dst_lo, &xmm_dst_hi);
1766 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1780 *pd++ = pack_1x128_32 (
1781 pix_multiply_1x128 (
1782 pix_multiply_1x128 (
1783 unpack_32_1x128 (s), unpack_32_1x128 (m)),
1784 expand_alpha_1x128 (unpack_32_1x128 (d))));
1791 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
1794 const uint32_t * ps,
1795 const uint32_t * pm,
1800 __m128i xmm_alpha_lo, xmm_alpha_hi;
1801 __m128i xmm_src_lo, xmm_src_hi;
1802 __m128i xmm_dst_lo, xmm_dst_hi;
1803 __m128i xmm_mask_lo, xmm_mask_hi;
1805 while (w && (unsigned long)pd & 15)
1811 *pd++ = pack_1x128_32 (
1812 pix_multiply_1x128 (
1813 unpack_32_1x128 (d),
1814 pix_multiply_1x128 (unpack_32_1x128 (m),
1815 expand_alpha_1x128 (unpack_32_1x128 (s)))));
1821 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1822 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1823 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1825 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1826 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1827 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1829 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1830 &xmm_alpha_lo, &xmm_alpha_hi);
1831 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1832 &xmm_alpha_lo, &xmm_alpha_hi,
1833 &xmm_alpha_lo, &xmm_alpha_hi);
1835 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1836 &xmm_alpha_lo, &xmm_alpha_hi,
1837 &xmm_dst_lo, &xmm_dst_hi);
1840 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1854 *pd++ = pack_1x128_32 (
1855 pix_multiply_1x128 (
1856 unpack_32_1x128 (d),
1857 pix_multiply_1x128 (unpack_32_1x128 (m),
1858 expand_alpha_1x128 (unpack_32_1x128 (s)))));
1864 sse2_combine_out_ca (pixman_implementation_t *imp,
1867 const uint32_t * ps,
1868 const uint32_t * pm,
1873 __m128i xmm_alpha_lo, xmm_alpha_hi;
1874 __m128i xmm_src_lo, xmm_src_hi;
1875 __m128i xmm_dst_lo, xmm_dst_hi;
1876 __m128i xmm_mask_lo, xmm_mask_hi;
1878 while (w && (unsigned long)pd & 15)
1884 *pd++ = pack_1x128_32 (
1885 pix_multiply_1x128 (
1886 pix_multiply_1x128 (
1887 unpack_32_1x128 (s), unpack_32_1x128 (m)),
1888 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1894 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1895 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1896 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1898 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1899 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1900 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1902 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1903 &xmm_alpha_lo, &xmm_alpha_hi);
1904 negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
1905 &xmm_alpha_lo, &xmm_alpha_hi);
1907 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1908 &xmm_mask_lo, &xmm_mask_hi,
1909 &xmm_dst_lo, &xmm_dst_hi);
1910 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1911 &xmm_alpha_lo, &xmm_alpha_hi,
1912 &xmm_dst_lo, &xmm_dst_hi);
1915 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1929 *pd++ = pack_1x128_32 (
1930 pix_multiply_1x128 (
1931 pix_multiply_1x128 (
1932 unpack_32_1x128 (s), unpack_32_1x128 (m)),
1933 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1940 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
1943 const uint32_t * ps,
1944 const uint32_t * pm,
1949 __m128i xmm_alpha_lo, xmm_alpha_hi;
1950 __m128i xmm_src_lo, xmm_src_hi;
1951 __m128i xmm_dst_lo, xmm_dst_hi;
1952 __m128i xmm_mask_lo, xmm_mask_hi;
1954 while (w && (unsigned long)pd & 15)
1960 *pd++ = pack_1x128_32 (
1961 pix_multiply_1x128 (
1962 unpack_32_1x128 (d),
1963 negate_1x128 (pix_multiply_1x128 (
1964 unpack_32_1x128 (m),
1965 expand_alpha_1x128 (unpack_32_1x128 (s))))));
1971 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1972 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1973 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1975 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1976 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1977 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1979 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1980 &xmm_alpha_lo, &xmm_alpha_hi);
1982 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1983 &xmm_alpha_lo, &xmm_alpha_hi,
1984 &xmm_mask_lo, &xmm_mask_hi);
1986 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
1987 &xmm_mask_lo, &xmm_mask_hi);
1989 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1990 &xmm_mask_lo, &xmm_mask_hi,
1991 &xmm_dst_lo, &xmm_dst_hi);
1994 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2008 *pd++ = pack_1x128_32 (
2009 pix_multiply_1x128 (
2010 unpack_32_1x128 (d),
2011 negate_1x128 (pix_multiply_1x128 (
2012 unpack_32_1x128 (m),
2013 expand_alpha_1x128 (unpack_32_1x128 (s))))));
2018 static force_inline uint32_t
2019 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2023 __m128i m = unpack_32_1x128 (mask);
2024 __m128i s = unpack_32_1x128 (src);
2025 __m128i d = unpack_32_1x128 (dst);
2026 __m128i sa = expand_alpha_1x128 (s);
2027 __m128i da = expand_alpha_1x128 (d);
2029 s = pix_multiply_1x128 (s, m);
2030 m = negate_1x128 (pix_multiply_1x128 (m, sa));
2032 return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2036 sse2_combine_atop_ca (pixman_implementation_t *imp,
2039 const uint32_t * ps,
2040 const uint32_t * pm,
2045 __m128i xmm_src_lo, xmm_src_hi;
2046 __m128i xmm_dst_lo, xmm_dst_hi;
2047 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2048 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2049 __m128i xmm_mask_lo, xmm_mask_hi;
2051 while (w && (unsigned long)pd & 15)
2057 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2063 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2064 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2065 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2067 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2068 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2069 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2071 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2072 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2073 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2074 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2076 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2077 &xmm_mask_lo, &xmm_mask_hi,
2078 &xmm_src_lo, &xmm_src_hi);
2079 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2080 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2081 &xmm_mask_lo, &xmm_mask_hi);
2083 negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2085 pix_add_multiply_2x128 (
2086 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2087 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2088 &xmm_dst_lo, &xmm_dst_hi);
2091 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2105 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2110 static force_inline uint32_t
2111 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2115 __m128i m = unpack_32_1x128 (mask);
2116 __m128i s = unpack_32_1x128 (src);
2117 __m128i d = unpack_32_1x128 (dst);
2119 __m128i da = negate_1x128 (expand_alpha_1x128 (d));
2120 __m128i sa = expand_alpha_1x128 (s);
2122 s = pix_multiply_1x128 (s, m);
2123 m = pix_multiply_1x128 (m, sa);
2125 return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2129 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2132 const uint32_t * ps,
2133 const uint32_t * pm,
2138 __m128i xmm_src_lo, xmm_src_hi;
2139 __m128i xmm_dst_lo, xmm_dst_hi;
2140 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2141 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2142 __m128i xmm_mask_lo, xmm_mask_hi;
2144 while (w && (unsigned long)pd & 15)
2150 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2156 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2157 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2158 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2160 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2161 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2162 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2164 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2165 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2166 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2167 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2169 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2170 &xmm_mask_lo, &xmm_mask_hi,
2171 &xmm_src_lo, &xmm_src_hi);
2172 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2173 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2174 &xmm_mask_lo, &xmm_mask_hi);
2176 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2177 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2179 pix_add_multiply_2x128 (
2180 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2181 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2182 &xmm_dst_lo, &xmm_dst_hi);
2185 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2199 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2204 static force_inline uint32_t
2205 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2209 __m128i a = unpack_32_1x128 (mask);
2210 __m128i s = unpack_32_1x128 (src);
2211 __m128i d = unpack_32_1x128 (dst);
2213 __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
2214 a, expand_alpha_1x128 (s)));
2215 __m128i dest = pix_multiply_1x128 (s, a);
2216 __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
2218 return pack_1x128_32 (pix_add_multiply_1x128 (&d,
2225 sse2_combine_xor_ca (pixman_implementation_t *imp,
2228 const uint32_t * ps,
2229 const uint32_t * pm,
2234 __m128i xmm_src_lo, xmm_src_hi;
2235 __m128i xmm_dst_lo, xmm_dst_hi;
2236 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2237 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2238 __m128i xmm_mask_lo, xmm_mask_hi;
2240 while (w && (unsigned long)pd & 15)
2246 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2252 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2253 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2254 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2256 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2257 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2258 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2260 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2261 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2262 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2263 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2265 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2266 &xmm_mask_lo, &xmm_mask_hi,
2267 &xmm_src_lo, &xmm_src_hi);
2268 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2269 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2270 &xmm_mask_lo, &xmm_mask_hi);
2272 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2273 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2274 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2275 &xmm_mask_lo, &xmm_mask_hi);
2277 pix_add_multiply_2x128 (
2278 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2279 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2280 &xmm_dst_lo, &xmm_dst_hi);
2283 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2297 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2303 sse2_combine_add_ca (pixman_implementation_t *imp,
2306 const uint32_t * ps,
2307 const uint32_t * pm,
2312 __m128i xmm_src_lo, xmm_src_hi;
2313 __m128i xmm_dst_lo, xmm_dst_hi;
2314 __m128i xmm_mask_lo, xmm_mask_hi;
2316 while (w && (unsigned long)pd & 15)
2322 *pd++ = pack_1x128_32 (
2323 _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2324 unpack_32_1x128 (m)),
2325 unpack_32_1x128 (d)));
2331 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2332 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2333 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2335 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2336 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2337 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2339 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2340 &xmm_mask_lo, &xmm_mask_hi,
2341 &xmm_src_lo, &xmm_src_hi);
2344 (__m128i*)pd, pack_2x128_128 (
2345 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2346 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2360 *pd++ = pack_1x128_32 (
2361 _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2362 unpack_32_1x128 (m)),
2363 unpack_32_1x128 (d)));
/* ---------------------------------------------------
 * fb_compose_setup_SSE2
 */
2371 static force_inline __m128i
2372 create_mask_16_128 (uint16_t mask)
2374 return _mm_set1_epi16 (mask);
2377 /* Work around a code generation bug in Sun Studio 12. */
2378 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2379 # define create_mask_2x32_128(mask0, mask1) \
2380 (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
#else
static force_inline __m128i
create_mask_2x32_128 (uint32_t mask0,
                      uint32_t mask1)
{
    return _mm_set_epi32 (mask0, mask1, mask0, mask1);
}
#endif
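/* create_mask_2x32_128 replicates a pair of 32-bit values across the
 * register; e.g. create_mask_2x32_128 (0xff000000, 0xff000000) builds
 * the same constant as mask_ff000000 above.
 */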
/* -------------------------------------------------------------------
 * composite_over_n_8888
 */
2395 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2397 pixman_image_t * src_image,
2398 pixman_image_t * mask_image,
2399 pixman_image_t * dst_image,
2410 uint32_t *dst_line, *dst, d;
2413 __m128i xmm_src, xmm_alpha;
2414 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2416 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2421 PIXMAN_IMAGE_GET_LINE (
2422 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2424 xmm_src = expand_pixel_32_1x128 (src);
2425 xmm_alpha = expand_alpha_1x128 (xmm_src);
2431 dst_line += dst_stride;
2434 while (w && (unsigned long)dst & 15)
2437 *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2439 unpack_32_1x128 (d)));
2445 xmm_dst = load_128_aligned ((__m128i*)dst);
2447 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2449 over_2x128 (&xmm_src, &xmm_src,
2450 &xmm_alpha, &xmm_alpha,
2451 &xmm_dst_lo, &xmm_dst_hi);
/* rebuild the 4 pixel data and save */
2455 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2464 *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2466 unpack_32_1x128 (d)));
/* ---------------------------------------------------------------------
 * composite_over_n_0565
 */
2477 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2479 pixman_image_t * src_image,
2480 pixman_image_t * mask_image,
2481 pixman_image_t * dst_image,
2492 uint16_t *dst_line, *dst, d;
2495 __m128i xmm_src, xmm_alpha;
2496 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2498 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2503 PIXMAN_IMAGE_GET_LINE (
2504 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2506 xmm_src = expand_pixel_32_1x128 (src);
2507 xmm_alpha = expand_alpha_1x128 (xmm_src);
2513 dst_line += dst_stride;
2516 while (w && (unsigned long)dst & 15)
2520 *dst++ = pack_565_32_16 (
2521 pack_1x128_32 (over_1x128 (xmm_src,
2523 expand565_16_1x128 (d))));
2529 xmm_dst = load_128_aligned ((__m128i*)dst);
2531 unpack_565_128_4x128 (xmm_dst,
2532 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2534 over_2x128 (&xmm_src, &xmm_src,
2535 &xmm_alpha, &xmm_alpha,
2536 &xmm_dst0, &xmm_dst1);
2537 over_2x128 (&xmm_src, &xmm_src,
2538 &xmm_alpha, &xmm_alpha,
2539 &xmm_dst2, &xmm_dst3);
2541 xmm_dst = pack_565_4x128_128 (
2542 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2544 save_128_aligned ((__m128i*)dst, xmm_dst);
2553 *dst++ = pack_565_32_16 (
2554 pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
2555 expand565_16_1x128 (d))));
/* ------------------------------
 * composite_add_n_8888_8888_ca
 */
2565 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2567 pixman_image_t * src_image,
2568 pixman_image_t * mask_image,
2569 pixman_image_t * dst_image,
2580 uint32_t *dst_line, d;
2581 uint32_t *mask_line, m;
2583 int dst_stride, mask_stride;
2585 __m128i xmm_src, xmm_alpha;
2587 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2589 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2591 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2597 PIXMAN_IMAGE_GET_LINE (
2598 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2599 PIXMAN_IMAGE_GET_LINE (
2600 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2602 xmm_src = _mm_unpacklo_epi8 (
2603 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2604 xmm_alpha = expand_alpha_1x128 (xmm_src);
2606 mmx_alpha = xmm_alpha;
2611 const uint32_t *pm = (uint32_t *)mask_line;
2612 uint32_t *pd = (uint32_t *)dst_line;
2614 dst_line += dst_stride;
2615 mask_line += mask_stride;
2617 while (w && (unsigned long)pd & 15)
2625 mmx_mask = unpack_32_1x128 (m);
2626 mmx_dest = unpack_32_1x128 (d);
2628 *pd = pack_1x128_32 (
2629 _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), mmx_dest));
2638 xmm_mask = load_128_unaligned ((__m128i*)pm);
2642 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
/* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2645 if (pack_cmp != 0xffff)
2647 xmm_dst = load_128_aligned ((__m128i*)pd);
2649 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2651 pix_multiply_2x128 (&xmm_src, &xmm_src,
2652 &xmm_mask_lo, &xmm_mask_hi,
2653 &xmm_mask_lo, &xmm_mask_hi);
2654 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2657 (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
2673 mmx_mask = unpack_32_1x128 (m);
2674 mmx_dest = unpack_32_1x128 (d);
2676 *pd = pack_1x128_32 (
2677 _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), mmx_dest));
/* ---------------------------------------------------------------------------
 * composite_over_n_8888_8888_ca
 */
2692 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2694 pixman_image_t * src_image,
2695 pixman_image_t * mask_image,
2696 pixman_image_t * dst_image,
2707 uint32_t *dst_line, d;
2708 uint32_t *mask_line, m;
2710 int dst_stride, mask_stride;
2712 __m128i xmm_src, xmm_alpha;
2713 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2714 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2716 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2718 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
2723 PIXMAN_IMAGE_GET_LINE (
2724 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2725 PIXMAN_IMAGE_GET_LINE (
2726 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2728 xmm_src = _mm_unpacklo_epi8 (
2729 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2730 xmm_alpha = expand_alpha_1x128 (xmm_src);
2732 mmx_alpha = xmm_alpha;
2737 const uint32_t *pm = (uint32_t *)mask_line;
2738 uint32_t *pd = (uint32_t *)dst_line;
2740 dst_line += dst_stride;
2741 mask_line += mask_stride;
2743 while (w && (unsigned long)pd & 15)
2750 mmx_mask = unpack_32_1x128 (m);
2751 mmx_dest = unpack_32_1x128 (d);
2753 *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
2765 xmm_mask = load_128_unaligned ((__m128i*)pm);
2769 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
/* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2772 if (pack_cmp != 0xffff)
2774 xmm_dst = load_128_aligned ((__m128i*)pd);
2776 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2777 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2779 in_over_2x128 (&xmm_src, &xmm_src,
2780 &xmm_alpha, &xmm_alpha,
2781 &xmm_mask_lo, &xmm_mask_hi,
2782 &xmm_dst_lo, &xmm_dst_hi);
2785 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2800 mmx_mask = unpack_32_1x128 (m);
2801 mmx_dest = unpack_32_1x128 (d);
2803 *pd = pack_1x128_32 (
2804 in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
/*---------------------------------------------------------------------
 * composite_over_8888_n_8888
 */
2819 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
2821 pixman_image_t * src_image,
2822 pixman_image_t * mask_image,
2823 pixman_image_t * dst_image,
2833 uint32_t *dst_line, *dst;
2834 uint32_t *src_line, *src;
2837 int dst_stride, src_stride;
2840 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2841 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2842 __m128i xmm_alpha_lo, xmm_alpha_hi;
2844 PIXMAN_IMAGE_GET_LINE (
2845 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2846 PIXMAN_IMAGE_GET_LINE (
2847 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2849 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2851 xmm_mask = create_mask_16_128 (mask >> 24);
2856 dst_line += dst_stride;
2858 src_line += src_stride;
2861 while (w && (unsigned long)dst & 15)
2863 uint32_t s = *src++;
2869 __m128i ms = unpack_32_1x128 (s);
2870 __m128i alpha = expand_alpha_1x128 (ms);
2871 __m128i dest = xmm_mask;
2872 __m128i alpha_dst = unpack_32_1x128 (d);
2874 *dst = pack_1x128_32 (
2875 in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
2883 xmm_src = load_128_unaligned ((__m128i*)src);
2885 if (!is_zero (xmm_src))
2887 xmm_dst = load_128_aligned ((__m128i*)dst);
2889 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
2890 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2891 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2892 &xmm_alpha_lo, &xmm_alpha_hi);
2894 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
2895 &xmm_alpha_lo, &xmm_alpha_hi,
2896 &xmm_mask, &xmm_mask,
2897 &xmm_dst_lo, &xmm_dst_hi);
2900 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2910 uint32_t s = *src++;
2916 __m128i ms = unpack_32_1x128 (s);
2917 __m128i alpha = expand_alpha_1x128 (ms);
2918 __m128i mask = xmm_mask;
2919 __m128i dest = unpack_32_1x128 (d);
2921 *dst = pack_1x128_32 (
2922 in_over_1x128 (&ms, &alpha, &mask, &dest));
/*---------------------------------------------------------------------
 * composite_src_x888_8888
 */
2937 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
2939 pixman_image_t * src_image,
2940 pixman_image_t * mask_image,
2941 pixman_image_t * dst_image,
2951 uint32_t *dst_line, *dst;
2952 uint32_t *src_line, *src;
2954 int dst_stride, src_stride;
2957 PIXMAN_IMAGE_GET_LINE (
2958 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2959 PIXMAN_IMAGE_GET_LINE (
2960 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2965 dst_line += dst_stride;
2967 src_line += src_stride;
2970 while (w && (unsigned long)dst & 15)
2972 *dst++ = *src++ | 0xff000000;
2978 __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
2980 xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
2981 xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
2982 xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
2983 xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
2985 save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
2986 save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
2987 save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
2988 save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
2997 *dst++ = *src++ | 0xff000000;
/* ---------------------------------------------------------------------
 * composite_over_x888_n_8888
 */
3008 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
3010 pixman_image_t * src_image,
3011 pixman_image_t * mask_image,
3012 pixman_image_t * dst_image,
3022 uint32_t *dst_line, *dst;
3023 uint32_t *src_line, *src;
3025 int dst_stride, src_stride;
3028 __m128i xmm_mask, xmm_alpha;
3029 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3030 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3032 PIXMAN_IMAGE_GET_LINE (
3033 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3034 PIXMAN_IMAGE_GET_LINE (
3035 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3037 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
3039 xmm_mask = create_mask_16_128 (mask >> 24);
3040 xmm_alpha = mask_00ff;
3045 dst_line += dst_stride;
3047 src_line += src_stride;
3050 while (w && (unsigned long)dst & 15)
3052 uint32_t s = (*src++) | 0xff000000;
3055 __m128i src = unpack_32_1x128 (s);
3056 __m128i alpha = xmm_alpha;
3057 __m128i mask = xmm_mask;
3058 __m128i dest = unpack_32_1x128 (d);
3060 *dst++ = pack_1x128_32 (
3061 in_over_1x128 (&src, &alpha, &mask, &dest));
3068 xmm_src = _mm_or_si128 (
3069 load_128_unaligned ((__m128i*)src), mask_ff000000);
3070 xmm_dst = load_128_aligned ((__m128i*)dst);
3072 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3073 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3075 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3076 &xmm_alpha, &xmm_alpha,
3077 &xmm_mask, &xmm_mask,
3078 &xmm_dst_lo, &xmm_dst_hi);
3081 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3091 uint32_t s = (*src++) | 0xff000000;
3094 __m128i src = unpack_32_1x128 (s);
3095 __m128i alpha = xmm_alpha;
3096 __m128i mask = xmm_mask;
3097 __m128i dest = unpack_32_1x128 (d);
3099 *dst++ = pack_1x128_32 (
3100 in_over_1x128 (&src, &alpha, &mask, &dest));
/* --------------------------------------------------------------------
 * composite_over_8888_8888
 */
3112 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3114 pixman_image_t * src_image,
3115 pixman_image_t * mask_image,
3116 pixman_image_t * dst_image,
3126 int dst_stride, src_stride;
3127 uint32_t *dst_line, *dst;
3128 uint32_t *src_line, *src;
3130 PIXMAN_IMAGE_GET_LINE (
3131 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3132 PIXMAN_IMAGE_GET_LINE (
3133 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3140 sse2_combine_over_u (imp, op, dst, src, NULL, width);
/* ------------------------------------------------------------------
 * composite_over_8888_0565
 */
3150 static force_inline uint16_t
3151 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3155 ms = unpack_32_1x128 (src);
3156 return pack_565_32_16 (
3159 ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
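/* Reference formula for the OVER used above, per channel in 8-bit
 * arithmetic (a scalar sketch, not pixman API):
 *
 *     dst = src + mul_un8 (255 - src_alpha, dst)
 *
 * where mul_un8 (x, y) is x * y / 255 with rounding.  The result is
 * repacked to r5g6b5 by keeping the top 5/6/5 bits of each channel.
 */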
3163 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3165 pixman_image_t * src_image,
3166 pixman_image_t * mask_image,
3167 pixman_image_t * dst_image,
3177 uint16_t *dst_line, *dst, d;
3178 uint32_t *src_line, *src, s;
3179 int dst_stride, src_stride;
3182 __m128i xmm_alpha_lo, xmm_alpha_hi;
3183 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3184 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3186 PIXMAN_IMAGE_GET_LINE (
3187 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3188 PIXMAN_IMAGE_GET_LINE (
3189 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3194 * This code is copied from the MMX version, FIXME included.
3195 * If it's a problem there, it's probably a problem here too.
3197 assert (src_image->drawable == mask_image->drawable);
3205 dst_line += dst_stride;
3206 src_line += src_stride;
3209 /* Align dst on a 16-byte boundary */
3211 ((unsigned long)dst & 15))
3216 *dst++ = composite_over_8888_0565pixel (s, d);
3220 /* This is an 8-pixel loop */
3223 /* Loading the source unaligned, since its address
3224 * may not be 16-byte aligned.
3226 xmm_src = load_128_unaligned ((__m128i*) src);
3227 xmm_dst = load_128_aligned ((__m128i*) dst);
3230 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3231 unpack_565_128_4x128 (xmm_dst,
3232 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3233 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3234 &xmm_alpha_lo, &xmm_alpha_hi);
3236 /* Load the next 4 source pixels early,
3237 * to overlap the memory read with computation.
3239 xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3241 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3242 &xmm_alpha_lo, &xmm_alpha_hi,
3243 &xmm_dst0, &xmm_dst1);
3246 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3247 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3248 &xmm_alpha_lo, &xmm_alpha_hi);
3250 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3251 &xmm_alpha_lo, &xmm_alpha_hi,
3252 &xmm_dst2, &xmm_dst3);
3255 (__m128i*)dst, pack_565_4x128_128 (
3256 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
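/* Packing 8888 -> 565 keeps the top 5/6/5 bits of each channel; for
 * example 0xffaabbcc becomes
 * ((0xaa >> 3) << 11) | ((0xbb >> 2) << 5) | (0xcc >> 3) == 0xadd9.
 */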
3268 *dst++ = composite_over_8888_0565pixel (s, d);
3274 /* -----------------------------------------------------------------
3275 * composite_over_n_8_8888
3279 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3281 pixman_image_t * src_image,
3282 pixman_image_t * mask_image,
3283 pixman_image_t * dst_image,
3294 uint32_t *dst_line, *dst;
3295 uint8_t *mask_line, *mask;
3296 int dst_stride, mask_stride;
3300 __m128i xmm_src, xmm_alpha, xmm_def;
3301 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3302 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3304 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3306 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3312 PIXMAN_IMAGE_GET_LINE (
3313 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3314 PIXMAN_IMAGE_GET_LINE (
3315 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3317 xmm_def = create_mask_2x32_128 (src, src);
3318 xmm_src = expand_pixel_32_1x128 (src);
3319 xmm_alpha = expand_alpha_1x128 (xmm_src);
3321 mmx_alpha = xmm_alpha;
3326 dst_line += dst_stride;
3328 mask_line += mask_stride;
3331 while (w && (unsigned long)dst & 15)
3333 uint8_t m = *mask++;
3338 mmx_mask = expand_pixel_8_1x128 (m);
3339 mmx_dest = unpack_32_1x128 (d);
3341 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3353 m = *((uint32_t*)mask);
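/* With an opaque source (srca == 0xff) and all four mask bytes set,
 * (src IN mask) OVER dest degenerates to a plain copy, so the
 * precomputed solid xmm_def can be stored directly.
 */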
3355 if (srca == 0xff && m == 0xffffffff)
3357 save_128_aligned ((__m128i*)dst, xmm_def);
3361 xmm_dst = load_128_aligned ((__m128i*) dst);
3362 xmm_mask = unpack_32_1x128 (m);
3363 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3366 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3367 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3369 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3370 &xmm_mask_lo, &xmm_mask_hi);
3372 in_over_2x128 (&xmm_src, &xmm_src,
3373 &xmm_alpha, &xmm_alpha,
3374 &xmm_mask_lo, &xmm_mask_hi,
3375 &xmm_dst_lo, &xmm_dst_hi);
3378 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3388 uint8_t m = *mask++;
3393 mmx_mask = expand_pixel_8_1x128 (m);
3394 mmx_dest = unpack_32_1x128 (d);
3396 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3409 /* ----------------------------------------------------------------
3410 * pixman_fill_sse2
3414 pixman_fill_sse2 (uint32_t *bits,
3423 uint32_t byte_width;
3433 stride = stride * (int) sizeof (uint32_t) / 1;
3434 byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3440 data = (w << 16) | w;
3444 stride = stride * (int) sizeof (uint32_t) / 2;
3445 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3446 byte_width = 2 * width;
3449 data = (data & 0xffff) * 0x00010001;
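/* Multiplying by 0x00010001 replicates the low 16 bits into both
 * halves of the word, e.g. 0xabcd * 0x00010001 == 0xabcdabcd.
 */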
3453 stride = stride * (int) sizeof (uint32_t) / 4;
3454 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3455 byte_width = 4 * width;
3463 xmm_def = create_mask_2x32_128 (data, data);
3468 uint8_t *d = byte_line;
3469 byte_line += stride;
3472 while (w >= 1 && ((unsigned long)d & 1))
3474 *(uint8_t *)d = data;
3479 while (w >= 2 && ((unsigned long)d & 3))
3481 *(uint16_t *)d = data;
3486 while (w >= 4 && ((unsigned long)d & 15))
3488 *(uint32_t *)d = data;
3496 save_128_aligned ((__m128i*)(d), xmm_def);
3497 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3498 save_128_aligned ((__m128i*)(d + 32), xmm_def);
3499 save_128_aligned ((__m128i*)(d + 48), xmm_def);
3500 save_128_aligned ((__m128i*)(d + 64), xmm_def);
3501 save_128_aligned ((__m128i*)(d + 80), xmm_def);
3502 save_128_aligned ((__m128i*)(d + 96), xmm_def);
3503 save_128_aligned ((__m128i*)(d + 112), xmm_def);
3511 save_128_aligned ((__m128i*)(d), xmm_def);
3512 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3513 save_128_aligned ((__m128i*)(d + 32), xmm_def);
3514 save_128_aligned ((__m128i*)(d + 48), xmm_def);
3522 save_128_aligned ((__m128i*)(d), xmm_def);
3523 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3531 save_128_aligned ((__m128i*)(d), xmm_def);
3539 *(uint32_t *)d = data;
3547 *(uint16_t *)d = data;
3554 *(uint8_t *)d = data;
3564 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3566 pixman_image_t * src_image,
3567 pixman_image_t * mask_image,
3568 pixman_image_t * dst_image,
3579 uint32_t *dst_line, *dst;
3580 uint8_t *mask_line, *mask;
3581 int dst_stride, mask_stride;
3585 __m128i xmm_src, xmm_def;
3586 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3588 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3593 pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
3594 PIXMAN_FORMAT_BPP (dst_image->bits.format),
3595 dest_x, dest_y, width, height, 0);
3599 PIXMAN_IMAGE_GET_LINE (
3600 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3601 PIXMAN_IMAGE_GET_LINE (
3602 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3604 xmm_def = create_mask_2x32_128 (src, src);
3605 xmm_src = expand_pixel_32_1x128 (src);
3610 dst_line += dst_stride;
3612 mask_line += mask_stride;
3615 while (w && (unsigned long)dst & 15)
3617 uint8_t m = *mask++;
3621 *dst = pack_1x128_32 (
3622 pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
3635 m = *((uint32_t*)mask);
3637 if (srca == 0xff && m == 0xffffffff)
3639 save_128_aligned ((__m128i*)dst, xmm_def);
3643 xmm_mask = unpack_32_1x128 (m);
3644 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3647 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3649 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3650 &xmm_mask_lo, &xmm_mask_hi);
3652 pix_multiply_2x128 (&xmm_src, &xmm_src,
3653 &xmm_mask_lo, &xmm_mask_hi,
3654 &xmm_mask_lo, &xmm_mask_hi);
3657 (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
3661 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
3671 uint8_t m = *mask++;
3675 *dst = pack_1x128_32 (
3676 pix_multiply_1x128 (
3677 xmm_src, expand_pixel_8_1x128 (m)));
3691 /*-----------------------------------------------------------------------
3692 * composite_over_n_8_0565
3696 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3698 pixman_image_t * src_image,
3699 pixman_image_t * mask_image,
3700 pixman_image_t * dst_image,
3711 uint16_t *dst_line, *dst, d;
3712 uint8_t *mask_line, *mask;
3713 int dst_stride, mask_stride;
3716 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3718 __m128i xmm_src, xmm_alpha;
3719 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3720 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3722 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
3728 PIXMAN_IMAGE_GET_LINE (
3729 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3730 PIXMAN_IMAGE_GET_LINE (
3731 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3733 xmm_src = expand_pixel_32_1x128 (src);
3734 xmm_alpha = expand_alpha_1x128 (xmm_src);
3736 mmx_alpha = xmm_alpha;
3741 dst_line += dst_stride;
3743 mask_line += mask_stride;
3746 while (w && (unsigned long)dst & 15)
3753 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3754 mmx_dest = expand565_16_1x128 (d);
3756 *dst = pack_565_32_16 (
3759 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3768 xmm_dst = load_128_aligned ((__m128i*) dst);
3769 unpack_565_128_4x128 (xmm_dst,
3770 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3772 m = *((uint32_t*)mask);
3777 xmm_mask = unpack_32_1x128 (m);
3778 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3781 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3783 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3784 &xmm_mask_lo, &xmm_mask_hi);
3786 in_over_2x128 (&xmm_src, &xmm_src,
3787 &xmm_alpha, &xmm_alpha,
3788 &xmm_mask_lo, &xmm_mask_hi,
3789 &xmm_dst0, &xmm_dst1);
3792 m = *((uint32_t*)mask);
3797 xmm_mask = unpack_32_1x128 (m);
3798 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3801 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3803 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3804 &xmm_mask_lo, &xmm_mask_hi);
3805 in_over_2x128 (&xmm_src, &xmm_src,
3806 &xmm_alpha, &xmm_alpha,
3807 &xmm_mask_lo, &xmm_mask_hi,
3808 &xmm_dst2, &xmm_dst3);
3812 (__m128i*)dst, pack_565_4x128_128 (
3813 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3826 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3827 mmx_dest = expand565_16_1x128 (d);
3829 *dst = pack_565_32_16 (
3832 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3842 /* -----------------------------------------------------------------------
3843 * composite_over_pixbuf_0565
3847 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
3849 pixman_image_t * src_image,
3850 pixman_image_t * mask_image,
3851 pixman_image_t * dst_image,
3861 uint16_t *dst_line, *dst, d;
3862 uint32_t *src_line, *src, s;
3863 int dst_stride, src_stride;
3865 uint32_t opaque, zero;
3868 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3869 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3871 PIXMAN_IMAGE_GET_LINE (
3872 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3873 PIXMAN_IMAGE_GET_LINE (
3874 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3879 * This code is copied from the MMX version, FIXME included.
3880 * If it's a problem there, it's probably a problem here too.
3882 assert (src_image->drawable == mask_image->drawable);
3888 dst_line += dst_stride;
3890 src_line += src_stride;
3893 while (w && (unsigned long)dst & 15)
3898 ms = unpack_32_1x128 (s);
3900 *dst++ = pack_565_32_16 (
3902 over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3909 xmm_src = load_128_unaligned ((__m128i*)src);
3910 xmm_dst = load_128_aligned ((__m128i*)dst);
3912 opaque = is_opaque (xmm_src);
3913 zero = is_zero (xmm_src);
3915 unpack_565_128_4x128 (xmm_dst,
3916 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3917 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3919 /* preload next round */
3920 xmm_src = load_128_unaligned ((__m128i*)(src + 4));
3924 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3925 &xmm_dst0, &xmm_dst1);
3929 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3930 &xmm_dst0, &xmm_dst1);
3934 opaque = is_opaque (xmm_src);
3935 zero = is_zero (xmm_src);
3937 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3941 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3942 &xmm_dst2, &xmm_dst3);
3946 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3947 &xmm_dst2, &xmm_dst3);
3951 (__m128i*)dst, pack_565_4x128_128 (
3952 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3964 ms = unpack_32_1x128 (s);
3966 *dst++ = pack_565_32_16 (
3968 over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3975 /* -------------------------------------------------------------------------
3976 * composite_over_pixbuf_8888
3980 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
3982 pixman_image_t * src_image,
3983 pixman_image_t * mask_image,
3984 pixman_image_t * dst_image,
3994 uint32_t *dst_line, *dst, d;
3995 uint32_t *src_line, *src, s;
3996 int dst_stride, src_stride;
3998 uint32_t opaque, zero;
4000 __m128i xmm_src_lo, xmm_src_hi;
4001 __m128i xmm_dst_lo, xmm_dst_hi;
4003 PIXMAN_IMAGE_GET_LINE (
4004 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4005 PIXMAN_IMAGE_GET_LINE (
4006 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4011 * This code is copied from the MMX version, FIXME included.
4012 * If it's a problem there, it's probably a problem here too.
4014 assert (src_image->drawable == mask_image->drawable);
4020 dst_line += dst_stride;
4022 src_line += src_stride;
4025 while (w && (unsigned long)dst & 15)
4030 *dst++ = pack_1x128_32 (
4031 over_rev_non_pre_1x128 (
4032 unpack_32_1x128 (s), unpack_32_1x128 (d)));
4039 xmm_src_hi = load_128_unaligned ((__m128i*)src);
4041 opaque = is_opaque (xmm_src_hi);
4042 zero = is_zero (xmm_src_hi);
4044 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
4048 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
4049 &xmm_dst_lo, &xmm_dst_hi);
4052 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4056 xmm_dst_hi = load_128_aligned ((__m128i*)dst);
4058 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
4060 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
4061 &xmm_dst_lo, &xmm_dst_hi);
4064 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4077 *dst++ = pack_1x128_32 (
4078 over_rev_non_pre_1x128 (
4079 unpack_32_1x128 (s), unpack_32_1x128 (d)));
4087 /* -------------------------------------------------------------------------------------------------
4088 * composite_over_n_8888_0565_ca
4092 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
4094 pixman_image_t * src_image,
4095 pixman_image_t * mask_image,
4096 pixman_image_t * dst_image,
4107 uint16_t *dst_line, *dst, d;
4108 uint32_t *mask_line, *mask, m;
4109 int dst_stride, mask_stride;
4113 __m128i xmm_src, xmm_alpha;
4114 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4115 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
4117 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
4119 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4124 PIXMAN_IMAGE_GET_LINE (
4125 dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
4126 PIXMAN_IMAGE_GET_LINE (
4127 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
4129 xmm_src = expand_pixel_32_1x128 (src);
4130 xmm_alpha = expand_alpha_1x128 (xmm_src);
4132 mmx_alpha = xmm_alpha;
4139 mask_line += mask_stride;
4140 dst_line += dst_stride;
4142 while (w && ((unsigned long)dst & 15))
4144 m = *(uint32_t *) mask;
4149 mmx_mask = unpack_32_1x128 (m);
4150 mmx_dest = expand565_16_1x128 (d);
4152 *dst = pack_565_32_16 (
4155 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4166 xmm_mask = load_128_unaligned ((__m128i*)mask);
4167 xmm_dst = load_128_aligned ((__m128i*)dst);
4169 pack_cmp = _mm_movemask_epi8 (
4170 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
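/* pack_cmp is 0xffff iff all four mask pixels compared equal to
 * zero, in which case the in_over for these pixels can be skipped
 * entirely.
 */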
4172 unpack_565_128_4x128 (xmm_dst,
4173 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
4174 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4176 /* preload next round */
4177 xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4179 /* composite only if some of the 4 mask pixels are non-zero */
4180 if (pack_cmp != 0xffff)
4182 in_over_2x128 (&xmm_src, &xmm_src,
4183 &xmm_alpha, &xmm_alpha,
4184 &xmm_mask_lo, &xmm_mask_hi,
4185 &xmm_dst0, &xmm_dst1);
4189 pack_cmp = _mm_movemask_epi8 (
4190 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4192 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4194 if (pack_cmp != 0xffff)
4196 in_over_2x128 (&xmm_src, &xmm_src,
4197 &xmm_alpha, &xmm_alpha,
4198 &xmm_mask_lo, &xmm_mask_hi,
4199 &xmm_dst2, &xmm_dst3);
4203 (__m128i*)dst, pack_565_4x128_128 (
4204 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4213 m = *(uint32_t *) mask;
4218 mmx_mask = unpack_32_1x128 (m);
4219 mmx_dest = expand565_16_1x128 (d);
4221 *dst = pack_565_32_16 (
4224 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4235 /* -----------------------------------------------------------------------
4236 * composite_in_n_8_8
4240 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4242 pixman_image_t * src_image,
4243 pixman_image_t * mask_image,
4244 pixman_image_t * dst_image,
4254 uint8_t *dst_line, *dst;
4255 uint8_t *mask_line, *mask;
4256 int dst_stride, mask_stride;
4263 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4264 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4266 PIXMAN_IMAGE_GET_LINE (
4267 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4268 PIXMAN_IMAGE_GET_LINE (
4269 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4271 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4275 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4280 dst_line += dst_stride;
4282 mask_line += mask_stride;
4285 while (w && ((unsigned long)dst & 15))
4287 m = (uint32_t) *mask++;
4288 d = (uint32_t) *dst;
4290 *dst++ = (uint8_t) pack_1x128_32 (
4291 pix_multiply_1x128 (
4292 pix_multiply_1x128 (xmm_alpha,
4293 unpack_32_1x128 (m)),
4294 unpack_32_1x128 (d)));
4300 xmm_mask = load_128_unaligned ((__m128i*)mask);
4301 xmm_dst = load_128_aligned ((__m128i*)dst);
4303 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4304 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4306 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4307 &xmm_mask_lo, &xmm_mask_hi,
4308 &xmm_mask_lo, &xmm_mask_hi);
4310 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4311 &xmm_dst_lo, &xmm_dst_hi,
4312 &xmm_dst_lo, &xmm_dst_hi);
4315 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4324 m = (uint32_t) *mask++;
4325 d = (uint32_t) *dst;
4327 *dst++ = (uint8_t) pack_1x128_32 (
4328 pix_multiply_1x128 (
4329 pix_multiply_1x128 (
4330 xmm_alpha, unpack_32_1x128 (m)),
4331 unpack_32_1x128 (d)));
4338 /* -----------------------------------------------------------------------
4343 sse2_composite_in_n_8 (pixman_implementation_t *imp,
4345 pixman_image_t * src_image,
4346 pixman_image_t * mask_image,
4347 pixman_image_t * dst_image,
4357 uint8_t *dst_line, *dst;
4364 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4366 PIXMAN_IMAGE_GET_LINE (
4367 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4369 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4371 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4380 pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4381 8, dest_x, dest_y, width, height, src);
4389 dst_line += dst_stride;
4392 while (w && ((unsigned long)dst & 15))
4394 d = (uint32_t) *dst;
4396 *dst++ = (uint8_t) pack_1x128_32 (
4397 pix_multiply_1x128 (
4399 unpack_32_1x128 (d)));
4405 xmm_dst = load_128_aligned ((__m128i*)dst);
4407 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4409 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4410 &xmm_dst_lo, &xmm_dst_hi,
4411 &xmm_dst_lo, &xmm_dst_hi);
4414 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4422 d = (uint32_t) *dst;
4424 *dst++ = (uint8_t) pack_1x128_32 (
4425 pix_multiply_1x128 (
4427 unpack_32_1x128 (d)));
4434 /* ---------------------------------------------------------------------------
4439 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4441 pixman_image_t * src_image,
4442 pixman_image_t * mask_image,
4443 pixman_image_t * dst_image,
4453 uint8_t *dst_line, *dst;
4454 uint8_t *src_line, *src;
4455 int src_stride, dst_stride;
4459 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4460 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4462 PIXMAN_IMAGE_GET_LINE (
4463 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4464 PIXMAN_IMAGE_GET_LINE (
4465 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4470 dst_line += dst_stride;
4472 src_line += src_stride;
4475 while (w && ((unsigned long)dst & 15))
4477 s = (uint32_t) *src++;
4478 d = (uint32_t) *dst;
4480 *dst++ = (uint8_t) pack_1x128_32 (
4481 pix_multiply_1x128 (
4482 unpack_32_1x128 (s), unpack_32_1x128 (d)));
4488 xmm_src = load_128_unaligned ((__m128i*)src);
4489 xmm_dst = load_128_aligned ((__m128i*)dst);
4491 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4492 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4494 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4495 &xmm_dst_lo, &xmm_dst_hi,
4496 &xmm_dst_lo, &xmm_dst_hi);
4499 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4508 s = (uint32_t) *src++;
4509 d = (uint32_t) *dst;
4511 *dst++ = (uint8_t) pack_1x128_32 (
4512 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
4519 /* -------------------------------------------------------------------------
4520 * composite_add_n_8_8
4524 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4526 pixman_image_t * src_image,
4527 pixman_image_t * mask_image,
4528 pixman_image_t * dst_image,
4538 uint8_t *dst_line, *dst;
4539 uint8_t *mask_line, *mask;
4540 int dst_stride, mask_stride;
4547 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4548 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4550 PIXMAN_IMAGE_GET_LINE (
4551 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4552 PIXMAN_IMAGE_GET_LINE (
4553 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4555 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4559 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4564 dst_line += dst_stride;
4566 mask_line += mask_stride;
4569 while (w && ((unsigned long)dst & 15))
4571 m = (uint32_t) *mask++;
4572 d = (uint32_t) *dst;
4574 *dst++ = (uint8_t) pack_1x128_32 (
4576 pix_multiply_1x128 (
4577 xmm_alpha, unpack_32_1x128 (m)),
4578 unpack_32_1x128 (d)));
4584 xmm_mask = load_128_unaligned ((__m128i*)mask);
4585 xmm_dst = load_128_aligned ((__m128i*)dst);
4587 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4588 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4590 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4591 &xmm_mask_lo, &xmm_mask_hi,
4592 &xmm_mask_lo, &xmm_mask_hi);
4594 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4595 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4598 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4607 m = (uint32_t) *mask++;
4608 d = (uint32_t) *dst;
4610 *dst++ = (uint8_t) pack_1x128_32 (
4612 pix_multiply_1x128 (
4613 xmm_alpha, unpack_32_1x128 (m)),
4614 unpack_32_1x128 (d)));
4622 /* -------------------------------------------------------------------------
4623 * composite_add_n_8
4627 sse2_composite_add_n_8 (pixman_implementation_t *imp,
4629 pixman_image_t * src_image,
4630 pixman_image_t * mask_image,
4631 pixman_image_t * dst_image,
4641 uint8_t *dst_line, *dst;
4648 PIXMAN_IMAGE_GET_LINE (
4649 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4651 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
4660 pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
4661 8, dest_x, dest_y, width, height, 0xff);
4666 src = (src << 24) | (src << 16) | (src << 8) | src;
4667 xmm_src = _mm_set_epi32 (src, src, src, src);
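/* Replicate the 8-bit source into all 16 byte lanes, so one
 * saturating add below handles 16 a8 pixels at a time, e.g.
 * src == 0x40 becomes sixteen 0x40 bytes.
 */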
4672 dst_line += dst_stride;
4675 while (w && ((unsigned long)dst & 15))
4677 *dst = (uint8_t)_mm_cvtsi128_si32 (
4680 _mm_cvtsi32_si128 (*dst)));
4689 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
4697 *dst = (uint8_t)_mm_cvtsi128_si32 (
4700 _mm_cvtsi32_si128 (*dst)));
4709 /* ----------------------------------------------------------------------
4714 sse2_composite_add_8_8 (pixman_implementation_t *imp,
4716 pixman_image_t * src_image,
4717 pixman_image_t * mask_image,
4718 pixman_image_t * dst_image,
4728 uint8_t *dst_line, *dst;
4729 uint8_t *src_line, *src;
4730 int dst_stride, src_stride;
4734 PIXMAN_IMAGE_GET_LINE (
4735 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4736 PIXMAN_IMAGE_GET_LINE (
4737 dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4744 dst_line += dst_stride;
4745 src_line += src_stride;
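/* Branch-free saturating byte add: t is at most 0x1fe, so (t >> 8)
 * is 1 exactly when the sum overflows a byte; 0 - 1 is all ones, and
 * OR-ing it in clamps the stored byte to 0xff.  E.g. t == 0x1a0
 * stores 0xff, while t == 0x7f stores 0x7f unchanged.
 */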
4749 while (w && (unsigned long)dst & 3)
4751 t = (*dst) + (*src++);
4752 *dst++ = t | (0 - (t >> 8));
4756 sse2_combine_add_u (imp, op,
4757 (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
4767 t = (*dst) + (*src++);
4768 *dst++ = t | (0 - (t >> 8));
4775 /* ---------------------------------------------------------------------
4776 * composite_add_8888_8888
4779 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4781 pixman_image_t * src_image,
4782 pixman_image_t * mask_image,
4783 pixman_image_t * dst_image,
4793 uint32_t *dst_line, *dst;
4794 uint32_t *src_line, *src;
4795 int dst_stride, src_stride;
4797 PIXMAN_IMAGE_GET_LINE (
4798 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4799 PIXMAN_IMAGE_GET_LINE (
4800 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4805 dst_line += dst_stride;
4807 src_line += src_stride;
4809 sse2_combine_add_u (imp, op, dst, src, NULL, width);
4814 /* -------------------------------------------------------------------------------------------------
4815 * sse2_composite_copy_area
4818 static pixman_bool_t
4819 pixman_blt_sse2 (uint32_t *src_bits,
4832 uint8_t * src_bytes;
4833 uint8_t * dst_bytes;
4836 if (src_bpp != dst_bpp)
4841 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
4842 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
4843 src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
4844 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
4845 byte_width = 2 * width;
4849 else if (src_bpp == 32)
4851 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
4852 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
4853 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
4854 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
4855 byte_width = 4 * width;
4867 uint8_t *s = src_bytes;
4868 uint8_t *d = dst_bytes;
4869 src_bytes += src_stride;
4870 dst_bytes += dst_stride;
4873 while (w >= 2 && ((unsigned long)d & 3))
4875 *(uint16_t *)d = *(uint16_t *)s;
4881 while (w >= 4 && ((unsigned long)d & 15))
4883 *(uint32_t *)d = *(uint32_t *)s;
4892 __m128i xmm0, xmm1, xmm2, xmm3;
4894 xmm0 = load_128_unaligned ((__m128i*)(s));
4895 xmm1 = load_128_unaligned ((__m128i*)(s + 16));
4896 xmm2 = load_128_unaligned ((__m128i*)(s + 32));
4897 xmm3 = load_128_unaligned ((__m128i*)(s + 48));
4899 save_128_aligned ((__m128i*)(d), xmm0);
4900 save_128_aligned ((__m128i*)(d + 16), xmm1);
4901 save_128_aligned ((__m128i*)(d + 32), xmm2);
4902 save_128_aligned ((__m128i*)(d + 48), xmm3);
4911 save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s));
4920 *(uint32_t *)d = *(uint32_t *)s;
4929 *(uint16_t *)d = *(uint16_t *)s;
4941 sse2_composite_copy_area (pixman_implementation_t *imp,
4943 pixman_image_t * src_image,
4944 pixman_image_t * mask_image,
4945 pixman_image_t * dst_image,
4955 pixman_blt_sse2 (src_image->bits.bits,
4956 dst_image->bits.bits,
4957 src_image->bits.rowstride,
4958 dst_image->bits.rowstride,
4959 PIXMAN_FORMAT_BPP (src_image->bits.format),
4960 PIXMAN_FORMAT_BPP (dst_image->bits.format),
4961 src_x, src_y, dest_x, dest_y, width, height);
4965 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
4967 pixman_image_t * src_image,
4968 pixman_image_t * mask_image,
4969 pixman_image_t * dst_image,
4979 uint32_t *src, *src_line, s;
4980 uint32_t *dst, *dst_line, d;
4981 uint8_t *mask, *mask_line;
4983 int src_stride, mask_stride, dst_stride;
4987 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4988 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4989 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4991 PIXMAN_IMAGE_GET_LINE (
4992 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4993 PIXMAN_IMAGE_GET_LINE (
4994 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4995 PIXMAN_IMAGE_GET_LINE (
4996 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5001 src_line += src_stride;
5003 dst_line += dst_stride;
5005 mask_line += mask_stride;
5009 while (w && (unsigned long)dst & 15)
5011 s = 0xff000000 | *src++;
5012 m = (uint32_t) *mask++;
5014 ms = unpack_32_1x128 (s);
5018 __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
5019 __m128i md = unpack_32_1x128 (d);
5021 ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
5024 *dst++ = pack_1x128_32 (ms);
5030 m = *(uint32_t*) mask;
5031 xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
5033 if (m == 0xffffffff)
5035 save_128_aligned ((__m128i*)dst, xmm_src);
5039 xmm_dst = load_128_aligned ((__m128i*)dst);
5041 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5043 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5044 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5045 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5047 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5049 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5051 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5062 m = (uint32_t) *mask++;
5066 s = 0xff000000 | *src;
5078 ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
5079 md = unpack_32_1x128 (d);
5080 ms = unpack_32_1x128 (s);
5082 *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
5096 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
5098 pixman_image_t * src_image,
5099 pixman_image_t * mask_image,
5100 pixman_image_t * dst_image,
5110 uint32_t *src, *src_line, s;
5111 uint32_t *dst, *dst_line, d;
5112 uint8_t *mask, *mask_line;
5114 int src_stride, mask_stride, dst_stride;
5117 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5118 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5119 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5121 PIXMAN_IMAGE_GET_LINE (
5122 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5123 PIXMAN_IMAGE_GET_LINE (
5124 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
5125 PIXMAN_IMAGE_GET_LINE (
5126 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5131 src_line += src_stride;
5133 dst_line += dst_stride;
5135 mask_line += mask_stride;
5139 while (w && (unsigned long)dst & 15)
5144 m = (uint32_t) *mask++;
5151 if (sa == 0xff && m == 0xff)
5157 __m128i ms, md, ma, msa;
5159 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5160 ms = unpack_32_1x128 (s);
5161 md = unpack_32_1x128 (d);
5163 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5165 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5175 m = *(uint32_t *) mask;
5179 xmm_src = load_128_unaligned ((__m128i*)src);
5181 if (m == 0xffffffff && is_opaque (xmm_src))
5183 save_128_aligned ((__m128i *)dst, xmm_src);
5187 xmm_dst = load_128_aligned ((__m128i *)dst);
5189 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5191 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5192 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5193 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5195 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5196 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5198 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5199 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5201 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5216 m = (uint32_t) *mask++;
5223 if (sa == 0xff && m == 0xff)
5229 __m128i ms, md, ma, msa;
5231 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5232 ms = unpack_32_1x128 (s);
5233 md = unpack_32_1x128 (d);
5235 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5237 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5249 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5251 pixman_image_t * src_image,
5252 pixman_image_t * mask_image,
5253 pixman_image_t * dst_image,
5264 uint32_t *dst_line, *dst;
5266 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5267 __m128i xmm_dsta_hi, xmm_dsta_lo;
5271 src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
5276 PIXMAN_IMAGE_GET_LINE (
5277 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5279 xmm_src = expand_pixel_32_1x128 (src);
5285 dst_line += dst_stride;
5288 while (w && (unsigned long)dst & 15)
5292 vd = unpack_32_1x128 (*dst);
5294 *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5302 __m128i tmp_lo, tmp_hi;
5304 xmm_dst = load_128_aligned ((__m128i*)dst);
5306 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5307 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5312 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5313 &xmm_dsta_lo, &xmm_dsta_hi,
5317 (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
5327 vd = unpack_32_1x128 (*dst);
5329 *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5340 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5342 pixman_image_t * src_image,
5343 pixman_image_t * mask_image,
5344 pixman_image_t * dst_image,
5354 uint32_t *src, *src_line, s;
5355 uint32_t *dst, *dst_line, d;
5356 uint32_t *mask, *mask_line;
5358 int src_stride, mask_stride, dst_stride;
5361 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5362 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5363 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5365 PIXMAN_IMAGE_GET_LINE (
5366 dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5367 PIXMAN_IMAGE_GET_LINE (
5368 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
5369 PIXMAN_IMAGE_GET_LINE (
5370 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5375 src_line += src_stride;
5377 dst_line += dst_stride;
5379 mask_line += mask_stride;
5383 while (w && (unsigned long)dst & 15)
5388 m = (*mask++) >> 24;
5395 if (sa == 0xff && m == 0xff)
5401 __m128i ms, md, ma, msa;
5403 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5404 ms = unpack_32_1x128 (s);
5405 md = unpack_32_1x128 (d);
5407 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5409 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5419 xmm_mask = load_128_unaligned ((__m128i*)mask);
5421 if (!is_transparent (xmm_mask))
5423 xmm_src = load_128_unaligned ((__m128i*)src);
5425 if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5427 save_128_aligned ((__m128i *)dst, xmm_src);
5431 xmm_dst = load_128_aligned ((__m128i *)dst);
5433 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5434 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5435 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5437 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5438 expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5440 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5441 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5443 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5458 m = (*mask++) >> 24;
5465 if (sa == 0xff && m == 0xff)
5471 __m128i ms, md, ma, msa;
5473 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5474 ms = unpack_32_1x128 (s);
5475 md = unpack_32_1x128 (d);
5477 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5479 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5490 /* A variant of 'sse2_combine_over_u' with minor tweaks */
5491 static force_inline void
5492 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
5496 pixman_fixed_t unit_x,
5497 pixman_fixed_t max_vx,
5498 pixman_bool_t fully_transparent_src)
5501 const uint32_t* pm = NULL;
5503 __m128i xmm_dst_lo, xmm_dst_hi;
5504 __m128i xmm_src_lo, xmm_src_hi;
5505 __m128i xmm_alpha_lo, xmm_alpha_hi;
5507 if (fully_transparent_src)
5510 /* Align dst on a 16-byte boundary */
5511 while (w && ((unsigned long)pd & 15))
5514 s = combine1 (ps + (vx >> 16), pm);
5517 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5526 uint32_t tmp1, tmp2, tmp3, tmp4;
5528 tmp1 = ps[vx >> 16];
5530 tmp2 = ps[vx >> 16];
5532 tmp3 = ps[vx >> 16];
5534 tmp4 = ps[vx >> 16];
5537 tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
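/* vx is a 16.16 fixed-point source coordinate, so vx >> 16 is the
 * integer index of the nearest source pixel; unit_x (also 16.16) is
 * added between fetches.  Four fetches are gathered into one
 * register and composited as a batch.
 */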
5539 xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
5541 if (is_opaque (xmm_src_hi))
5543 save_128_aligned ((__m128i*)pd, xmm_src_hi);
5545 else if (!is_zero (xmm_src_hi))
5547 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
5549 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
5550 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
5552 expand_alpha_2x128 (
5553 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5555 over_2x128 (&xmm_src_lo, &xmm_src_hi,
5556 &xmm_alpha_lo, &xmm_alpha_hi,
5557 &xmm_dst_lo, &xmm_dst_hi);
5559 /* rebuild the 4 pixel data and save */
5560 save_128_aligned ((__m128i*)pd,
5561 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5573 s = combine1 (ps + (vx >> 16), pm);
5576 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5584 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
5585 scaled_nearest_scanline_sse2_8888_8888_OVER,
5586 uint32_t, uint32_t, COVER)
5587 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
5588 scaled_nearest_scanline_sse2_8888_8888_OVER,
5589 uint32_t, uint32_t, NONE)
5590 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
5591 scaled_nearest_scanline_sse2_8888_8888_OVER,
5592 uint32_t, uint32_t, PAD)
5594 static force_inline void
5595 scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
5597 const uint32_t * src,
5600 pixman_fixed_t unit_x,
5601 pixman_fixed_t max_vx,
5602 pixman_bool_t zero_src)
5605 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5606 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5607 __m128i xmm_alpha_lo, xmm_alpha_hi;
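/* A solid mask with zero alpha makes OVER a no-op, so the whole
 * scanline can be skipped up front.
 */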
5609 if (zero_src || (*mask >> 24) == 0)
5612 xmm_mask = create_mask_16_128 (*mask >> 24);
5614 while (w && (unsigned long)dst & 15)
5616 uint32_t s = src[pixman_fixed_to_int (vx)];
5623 __m128i ms = unpack_32_1x128 (s);
5624 __m128i alpha = expand_alpha_1x128 (ms);
5625 __m128i dest = xmm_mask;
5626 __m128i alpha_dst = unpack_32_1x128 (d);
5628 *dst = pack_1x128_32 (
5629 in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
5637 uint32_t tmp1, tmp2, tmp3, tmp4;
5639 tmp1 = src[pixman_fixed_to_int (vx)];
5641 tmp2 = src[pixman_fixed_to_int (vx)];
5643 tmp3 = src[pixman_fixed_to_int (vx)];
5645 tmp4 = src[pixman_fixed_to_int (vx)];
5648 xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5650 if (!is_zero (xmm_src))
5652 xmm_dst = load_128_aligned ((__m128i*)dst);
5654 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5655 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5656 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
5657 &xmm_alpha_lo, &xmm_alpha_hi);
5659 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
5660 &xmm_alpha_lo, &xmm_alpha_hi,
5661 &xmm_mask, &xmm_mask,
5662 &xmm_dst_lo, &xmm_dst_hi);
5665 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5674 uint32_t s = src[pixman_fixed_to_int (vx)];
5681 __m128i ms = unpack_32_1x128 (s);
5682 __m128i alpha = expand_alpha_1x128 (ms);
5683 __m128i mask = xmm_mask;
5684 __m128i dest = unpack_32_1x128 (d);
5686 *dst = pack_1x128_32 (
5687 in_over_1x128 (&ms, &alpha, &mask, &dest));
5696 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
5697 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5698 uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
5699 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
5700 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5701 uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
5702 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
5703 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5704 uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
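/* The mainloop macros above instantiate the outer loops for each
 * repeat mode: COVER (the source completely covers the destination,
 * so no repeat handling is needed), NONE (pixels outside the source
 * read as transparent black) and PAD (coordinates are clamped to the
 * source edges).
 */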
5706 static const pixman_fast_path_t sse2_fast_paths[] =
5708 /* PIXMAN_OP_OVER */
5709 PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
5710 PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
5711 PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
5712 PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
5713 PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
5714 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
5715 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
5716 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
5717 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
5718 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
5719 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
5720 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
5721 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
5722 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
5723 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
5724 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
5725 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
5726 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
5727 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
5728 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
5729 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
5730 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
5731 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
5732 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
5733 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
5734 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
5735 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
5736 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
5737 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
5738 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
5739 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
5740 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
5741 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5742 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
5743 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5744 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
5745 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
5746 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
5747 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
5748 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
5749 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
5750 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
5751 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
5752 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
5753 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5754 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5756 /* PIXMAN_OP_OVER_REVERSE */
5757 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
5758 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
5761 PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
5762 PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
5763 PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
5764 PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
5765 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
5766 PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
5769 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
5770 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
5771 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
5772 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
5773 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
5774 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
5775 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
5776 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
5777 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5778 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5779 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
5780 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
5781 PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
5782 PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
5785 PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
5786 PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
5787 PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
5789 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5790 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5791 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5792 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5793 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5794 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5795 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5796 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5797 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
5798 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
5799 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
5800 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
5802 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
5803 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
5804 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
5805 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
5810 static pixman_bool_t
5811 sse2_blt (pixman_implementation_t *imp,
5812 uint32_t * src_bits,
5813 uint32_t * dst_bits,
5825 if (!pixman_blt_sse2 (
5826 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5827 src_x, src_y, dst_x, dst_y, width, height))
5830 return _pixman_implementation_blt (
5832 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5833 src_x, src_y, dst_x, dst_y, width, height);
5839 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5840 __attribute__((__force_align_arg_pointer__))
5842 static pixman_bool_t
5843 sse2_fill (pixman_implementation_t *imp,
5853 if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
5855 return _pixman_implementation_fill (
5856 imp->delegate, bits, stride, bpp, x, y, width, height, xor);
5863 sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
5865 int w = iter->width;
5866 __m128i ff000000 = mask_ff000000;
5867 uint32_t *dst = iter->buffer;
5868 uint32_t *src = (uint32_t *)iter->bits;
5870 iter->bits += iter->stride;
5872 while (w && ((unsigned long)dst) & 0x0f)
5874 *dst++ = (*src++) | 0xff000000;
5881 (__m128i *)dst, _mm_or_si128 (
5882 load_128_unaligned ((__m128i *)src), ff000000));
5891 *dst++ = (*src++) | 0xff000000;
5895 return iter->buffer;
5899 sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
5901 int w = iter->width;
5902 uint32_t *dst = iter->buffer;
5903 uint16_t *src = (uint16_t *)iter->bits;
5904 __m128i ff000000 = mask_ff000000;
5906 iter->bits += iter->stride;
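/* Scalar 0565 -> 8888 expansion replicates the top bits into the low
 * bits so 0x1f maps to 0xff exactly, i.e. r8 = (r5 << 3) | (r5 >> 2),
 * g8 = (g6 << 2) | (g6 >> 4), b8 = (b5 << 3) | (b5 >> 2), with the
 * alpha byte forced to 0xff.
 */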
5908 while (w && ((unsigned long)dst) & 0x0f)
5910 uint16_t s = *src++;
5912 *dst++ = CONVERT_0565_TO_8888 (s);
5920 s = _mm_loadu_si128 ((__m128i *)src);
5922 lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
5923 hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
5925 save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
5926 save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
5935 uint16_t s = *src++;
5937 *dst++ = CONVERT_0565_TO_8888 (s);
5941 return iter->buffer;
5945 sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
5947 int w = iter->width;
5948 uint32_t *dst = iter->buffer;
5949 uint8_t *src = iter->bits;
5950 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5952 iter->bits += iter->stride;
5954 while (w && (((unsigned long)dst) & 15))
5956 *dst++ = *(src++) << 24;
5962 xmm0 = _mm_loadu_si128((__m128i *)src);
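/* Each unpack interleaves zeros *below* the data bytes, so two
 * rounds move every a8 byte into the top byte of its own 32-bit lane
 * (value << 24), expanding 16 a8 pixels to 16 alpha-only 8888 pixels
 * per iteration.
 */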
5964 xmm1 = _mm_unpacklo_epi8 (_mm_setzero_si128(), xmm0);
5965 xmm2 = _mm_unpackhi_epi8 (_mm_setzero_si128(), xmm0);
5966 xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
5967 xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
5968 xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
5969 xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
5971 _mm_store_si128(((__m128i *)(dst + 0)), xmm3);
5972 _mm_store_si128(((__m128i *)(dst + 4)), xmm4);
5973 _mm_store_si128(((__m128i *)(dst + 8)), xmm5);
5974 _mm_store_si128(((__m128i *)(dst + 12)), xmm6);
5983 *dst++ = *(src++) << 24;
5987 return iter->buffer;
5992 pixman_format_code_t format;
5993 pixman_iter_get_scanline_t get_scanline;
5996 static const fetcher_info_t fetchers[] =
5998 { PIXMAN_x8r8g8b8, sse2_fetch_x8r8g8b8 },
5999 { PIXMAN_r5g6b5, sse2_fetch_r5g6b5 },
6000 { PIXMAN_a8, sse2_fetch_a8 },
6005 sse2_src_iter_init (pixman_implementation_t *imp,
6006 pixman_iter_t *iter,
6007 pixman_image_t *image,
6008 int x, int y, int width, int height,
6009 uint8_t *buffer, iter_flags_t flags)
6012 (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)
6014 if ((flags & ITER_NARROW) &&
6015 (image->common.flags & FLAGS) == FLAGS &&
6017 x + width <= image->bits.width &&
6018 y + height <= image->bits.height)
6020 const fetcher_info_t *f;
6022 for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
6024 if (image->common.extended_format_code == f->format)
6026 uint8_t *b = (uint8_t *)image->bits.bits;
6027 int s = image->bits.rowstride * 4;
6029 iter->bits = b + s * y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
6031 iter->width = width;
6032 iter->buffer = (uint32_t *)buffer;
6034 iter->get_scanline = f->get_scanline;
6040 _pixman_implementation_src_iter_init (
6041 imp->delegate, iter, image, x, y, width, height, buffer, flags);
6044 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
6045 __attribute__((__force_align_arg_pointer__))
6047 pixman_implementation_t *
6048 _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
6050 pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
6052 /* SSE2 constants */
6053 mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6054 mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
6055 mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
6056 mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
6057 mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6058 mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
6059 mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
6060 mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
6061 mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
6062 mask_0080 = create_mask_16_128 (0x0080);
6063 mask_00ff = create_mask_16_128 (0x00ff);
6064 mask_0101 = create_mask_16_128 (0x0101);
6065 mask_ffff = create_mask_16_128 (0xffff);
6066 mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
6067 mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
6070 /* Set up function pointers */
6072 /* SSE code patch for fbcompose.c */
6073 imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
6074 imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
6075 imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
6076 imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
6077 imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
6078 imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
6079 imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
6080 imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
6081 imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
6082 imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
6084 imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
6086 imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
6087 imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
6088 imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
6089 imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
6090 imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
6091 imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
6092 imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
6093 imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
6094 imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
6095 imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
6096 imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
6098 imp->blt = sse2_blt;
6099 imp->fill = sse2_fill;
6101 imp->src_iter_init = sse2_src_iter_init;
6106 #endif /* USE_SSE2 */