X-Git-Url: http://review.tizen.org/git/?a=blobdiff_plain;f=pixman%2Fpixman-sse2.c;h=e217ca339a88f285039fcac801ccf2df459c9b48;hb=17acc7a4c707db4804b6bf47db30883745049fdb;hp=c81836ab99261abb2322d58cc1ad6ae2f86c3cd2;hpb=04ade7b68c620a62daff6212eee4d1b96bfbc3c9;p=profile%2Fivi%2Fpixman.git diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c index c81836a..e217ca3 100644 --- a/pixman/pixman-sse2.c +++ b/pixman/pixman-sse2.c @@ -30,34 +30,11 @@ #include #endif -#include #include /* for _mm_shuffle_pi16 and _MM_SHUFFLE */ #include /* for SSE2 intrinsics */ #include "pixman-private.h" #include "pixman-combine32.h" - -#if defined(_MSC_VER) && defined(_M_AMD64) -/* Windows 64 doesn't allow MMX to be used, so - * the pixman-x64-mmx-emulation.h file contains - * implementations of those MMX intrinsics that - * are used in the SSE2 implementation. - */ -# include "pixman-x64-mmx-emulation.h" -#endif - -#ifdef USE_SSE2 - -/* -------------------------------------------------------------------- - * Locals - */ - -static __m64 mask_x0080; -static __m64 mask_x00ff; -static __m64 mask_x0101; -static __m64 mask_x_alpha; - -static __m64 mask_x565_rgb; -static __m64 mask_x565_unpack; +#include "pixman-inlines.h" static __m128i mask_0080; static __m128i mask_00ff; @@ -76,9 +53,6 @@ static __m128i mask_blue; static __m128i mask_565_fix_rb; static __m128i mask_565_fix_g; -/* ---------------------------------------------------------------------- - * SSE2 Inlines - */ static force_inline __m128i unpack_32_1x128 (uint32_t data) { @@ -356,18 +330,6 @@ in_over_2x128 (__m128i* src_lo, over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi); } -static force_inline void -cache_prefetch (__m128i* addr) -{ - _mm_prefetch ((void const*)addr, _MM_HINT_T0); -} - -static force_inline void -cache_prefetch_next (__m128i* addr) -{ - _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */ -} - /* load 4 pixels from a 16-byte boundary aligned address */ static force_inline __m128i load_128_aligned (__m128i* src) @@ -408,140 +370,104 @@ save_128_unaligned (__m128i* dst, _mm_storeu_si128 (dst, data); } -/* ------------------------------------------------------------------ - * MMX inlines - */ - -static force_inline __m64 -unpack_32_1x64 (uint32_t data) -{ - return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64 ()); -} - -static force_inline __m64 -expand_alpha_1x64 (__m64 data) +static force_inline __m128i +load_32_1x128 (uint32_t data) { - return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3)); + return _mm_cvtsi32_si128 (data); } -static force_inline __m64 -expand_alpha_rev_1x64 (__m64 data) +static force_inline __m128i +expand_alpha_rev_1x128 (__m128i data) { - return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0)); + return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0)); } -static force_inline __m64 -expand_pixel_8_1x64 (uint8_t data) +static force_inline __m128i +expand_pixel_8_1x128 (uint8_t data) { - return _mm_shuffle_pi16 ( - unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0)); + return _mm_shufflelo_epi16 ( + unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0)); } -static force_inline __m64 -pix_multiply_1x64 (__m64 data, - __m64 alpha) +static force_inline __m128i +pix_multiply_1x128 (__m128i data, + __m128i alpha) { - return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha), - mask_x0080), - mask_x0101); + return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha), + mask_0080), + mask_0101); } -static force_inline __m64 -pix_add_multiply_1x64 (__m64* src, - __m64* alpha_dst, - __m64* dst, - __m64* alpha_src) +static force_inline __m128i +pix_add_multiply_1x128 (__m128i* src, + __m128i* alpha_dst, + __m128i* dst, + __m128i* alpha_src) { - __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst); - __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src); + __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst); + __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src); - return _mm_adds_pu8 (t1, t2); + return _mm_adds_epu8 (t1, t2); } -static force_inline __m64 -negate_1x64 (__m64 data) +static force_inline __m128i +negate_1x128 (__m128i data) { - return _mm_xor_si64 (data, mask_x00ff); + return _mm_xor_si128 (data, mask_00ff); } -static force_inline __m64 -invert_colors_1x64 (__m64 data) +static force_inline __m128i +invert_colors_1x128 (__m128i data) { - return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2)); + return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2)); } -static force_inline __m64 -over_1x64 (__m64 src, __m64 alpha, __m64 dst) +static force_inline __m128i +over_1x128 (__m128i src, __m128i alpha, __m128i dst) { - return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha))); + return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha))); } -static force_inline __m64 -in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst) +static force_inline __m128i +in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst) { - return over_1x64 (pix_multiply_1x64 (*src, *mask), - pix_multiply_1x64 (*alpha, *mask), - *dst); + return over_1x128 (pix_multiply_1x128 (*src, *mask), + pix_multiply_1x128 (*alpha, *mask), + *dst); } -static force_inline __m64 -over_rev_non_pre_1x64 (__m64 src, __m64 dst) +static force_inline __m128i +over_rev_non_pre_1x128 (__m128i src, __m128i dst) { - __m64 alpha = expand_alpha_1x64 (src); + __m128i alpha = expand_alpha_1x128 (src); - return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src), - _mm_or_si64 (alpha, mask_x_alpha)), - alpha, - dst); + return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src), + _mm_or_si128 (alpha, mask_alpha)), + alpha, + dst); } static force_inline uint32_t -pack_1x64_32 (__m64 data) +pack_1x128_32 (__m128i data) { - return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ())); + return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ())); } -/* Expand 16 bits positioned at @pos (0-3) of a mmx register into - * - * 00RR00GG00BB - * - * --- Expanding 565 in the low word --- - * - * m = (m << (32 - 3)) | (m << (16 - 5)) | m; - * m = m & (01f0003f001f); - * m = m * (008404100840); - * m = m >> 8; - * - * Note the trick here - the top word is shifted by another nibble to - * avoid it bumping into the middle word - */ -static force_inline __m64 -expand565_16_1x64 (uint16_t pixel) +static force_inline __m128i +expand565_16_1x128 (uint16_t pixel) { - __m64 p; - __m64 t1, t2; - - p = _mm_cvtsi32_si64 ((uint32_t) pixel); - - t1 = _mm_slli_si64 (p, 36 - 11); - t2 = _mm_slli_si64 (p, 16 - 5); + __m128i m = _mm_cvtsi32_si128 (pixel); - p = _mm_or_si64 (t1, p); - p = _mm_or_si64 (t2, p); - p = _mm_and_si64 (p, mask_x565_rgb); - p = _mm_mullo_pi16 (p, mask_x565_unpack); + m = unpack_565_to_8888 (m); - return _mm_srli_pi16 (p, 8); + return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ()); } -/* ---------------------------------------------------------------------------- - * Compose Core transformations - */ static force_inline uint32_t core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst) { uint8_t a; - __m64 ms; + __m128i xmms; a = src >> 24; @@ -551,9 +477,10 @@ core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst) } else if (src) { - ms = unpack_32_1x64 (src); - return pack_1x64_32 ( - over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst))); + xmms = unpack_32_1x128 (src); + return pack_1x128_32 ( + over_1x128 (xmms, expand_alpha_1x128 (xmms), + unpack_32_1x128 (dst))); } return dst; @@ -566,15 +493,15 @@ combine1 (const uint32_t *ps, const uint32_t *pm) if (pm) { - __m64 ms, mm; + __m128i ms, mm; - mm = unpack_32_1x64 (*pm); - mm = expand_alpha_1x64 (mm); + mm = unpack_32_1x128 (*pm); + mm = expand_alpha_1x128 (mm); - ms = unpack_32_1x64 (s); - ms = pix_multiply_1x64 (ms, mm); + ms = unpack_32_1x128 (s); + ms = pix_multiply_1x128 (ms, mm); - s = pack_1x64_32 (ms); + s = pack_1x128_32 (ms); } return s; @@ -615,101 +542,182 @@ combine4 (const __m128i *ps, const __m128i *pm) } static force_inline void -core_combine_over_u_sse2 (uint32_t* pd, - const uint32_t* ps, - const uint32_t* pm, - int w) +core_combine_over_u_sse2_mask (uint32_t * pd, + const uint32_t* ps, + const uint32_t* pm, + int w) { uint32_t s, d; - __m128i xmm_dst_lo, xmm_dst_hi; - __m128i xmm_src_lo, xmm_src_hi; - __m128i xmm_alpha_lo, xmm_alpha_hi; - - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - /* Align dst on a 16-byte boundary */ while (w && ((unsigned long)pd & 15)) { d = *pd; s = combine1 (ps, pm); - *pd++ = core_combine_over_u_pixel_sse2 (s, d); + if (s) + *pd = core_combine_over_u_pixel_sse2 (s, d); + pd++; ps++; - if (pm) - pm++; + pm++; w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - - /* I'm loading unaligned because I'm not sure about - * the address alignment. - */ - xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); + __m128i mask = load_128_unaligned ((__m128i *)pm); - if (is_opaque (xmm_src_hi)) - { - save_128_aligned ((__m128i*)pd, xmm_src_hi); - } - else if (!is_zero (xmm_src_hi)) + if (!is_zero (mask)) { - xmm_dst_hi = load_128_aligned ((__m128i*) pd); + __m128i src; + __m128i src_hi, src_lo; + __m128i mask_hi, mask_lo; + __m128i alpha_hi, alpha_lo; - unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + src = load_128_unaligned ((__m128i *)ps); - expand_alpha_2x128 ( - xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi); + if (is_opaque (_mm_and_si128 (src, mask))) + { + save_128_aligned ((__m128i *)pd, src); + } + else + { + __m128i dst = load_128_aligned ((__m128i *)pd); + __m128i dst_hi, dst_lo; - over_2x128 (&xmm_src_lo, &xmm_src_hi, - &xmm_alpha_lo, &xmm_alpha_hi, - &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (mask, &mask_lo, &mask_hi); + unpack_128_2x128 (src, &src_lo, &src_hi); - /* rebuid the 4 pixel data and save*/ - save_128_aligned ((__m128i*)pd, - pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi); + pix_multiply_2x128 (&src_lo, &src_hi, + &mask_lo, &mask_hi, + &src_lo, &src_hi); + + unpack_128_2x128 (dst, &dst_lo, &dst_hi); + + expand_alpha_2x128 (src_lo, src_hi, + &alpha_lo, &alpha_hi); + + over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi, + &dst_lo, &dst_hi); + + save_128_aligned ( + (__m128i *)pd, + pack_2x128_128 (dst_lo, dst_hi)); + } } - w -= 4; + pm += 4; ps += 4; pd += 4; - if (pm) - pm += 4; + w -= 4; } - while (w) { d = *pd; s = combine1 (ps, pm); - *pd++ = core_combine_over_u_pixel_sse2 (s, d); + if (s) + *pd = core_combine_over_u_pixel_sse2 (s, d); + pd++; + ps++; + pm++; + + w--; + } +} + +static force_inline void +core_combine_over_u_sse2_no_mask (uint32_t * pd, + const uint32_t* ps, + int w) +{ + uint32_t s, d; + + /* Align dst on a 16-byte boundary */ + while (w && ((unsigned long)pd & 15)) + { + d = *pd; + s = *ps; + + if (s) + *pd = core_combine_over_u_pixel_sse2 (s, d); + pd++; + ps++; + w--; + } + + while (w >= 4) + { + __m128i src; + __m128i src_hi, src_lo, dst_hi, dst_lo; + __m128i alpha_hi, alpha_lo; + + src = load_128_unaligned ((__m128i *)ps); + + if (!is_zero (src)) + { + if (is_opaque (src)) + { + save_128_aligned ((__m128i *)pd, src); + } + else + { + __m128i dst = load_128_aligned ((__m128i *)pd); + + unpack_128_2x128 (src, &src_lo, &src_hi); + unpack_128_2x128 (dst, &dst_lo, &dst_hi); + + expand_alpha_2x128 (src_lo, src_hi, + &alpha_lo, &alpha_hi); + over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi, + &dst_lo, &dst_hi); + + save_128_aligned ( + (__m128i *)pd, + pack_2x128_128 (dst_lo, dst_hi)); + } + } + + ps += 4; + pd += 4; + w -= 4; + } + while (w) + { + d = *pd; + s = *ps; + + if (s) + *pd = core_combine_over_u_pixel_sse2 (s, d); + pd++; ps++; - if (pm) - pm++; w--; } } static force_inline void -core_combine_over_reverse_u_sse2 (uint32_t* pd, - const uint32_t* ps, - const uint32_t* pm, - int w) +sse2_combine_over_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) +{ + if (pm) + core_combine_over_u_sse2_mask (pd, ps, pm, w); + else + core_combine_over_u_sse2_no_mask (pd, ps, w); +} + +static void +sse2_combine_over_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, d; @@ -717,11 +725,6 @@ core_combine_over_reverse_u_sse2 (uint32_t* pd, __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_alpha_lo, xmm_alpha_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - /* Align dst on a 16-byte boundary */ while (w && ((unsigned long)pd & 15)) @@ -736,18 +739,8 @@ core_combine_over_reverse_u_sse2 (uint32_t* pd, pm++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - /* I'm loading unaligned because I'm not sure * about the address alignment. */ @@ -790,7 +783,7 @@ core_combine_over_reverse_u_sse2 (uint32_t* pd, } static force_inline uint32_t -core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst) +core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst) { uint32_t maska = src >> 24; @@ -800,54 +793,41 @@ core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst) } else if (maska != 0xff) { - return pack_1x64_32 ( - pix_multiply_1x64 (unpack_32_1x64 (dst), - expand_alpha_1x64 (unpack_32_1x64 (src)))); + return pack_1x128_32 ( + pix_multiply_1x128 (unpack_32_1x128 (dst), + expand_alpha_1x128 (unpack_32_1x128 (src)))); } return dst; } -static force_inline void -core_combine_in_u_sse2 (uint32_t* pd, - const uint32_t* ps, - const uint32_t* pm, - int w) +static void +sse2_combine_in_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, d; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && ((unsigned long) pd & 15)) { s = combine1 (ps, pm); d = *pd; - *pd++ = core_combine_in_u_pixelsse2 (d, s); + *pd++ = core_combine_in_u_pixel_sse2 (d, s); w--; ps++; if (pm) pm++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*) pd); xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm); @@ -874,7 +854,7 @@ core_combine_in_u_sse2 (uint32_t* pd, s = combine1 (ps, pm); d = *pd; - *pd++ = core_combine_in_u_pixelsse2 (d, s); + *pd++ = core_combine_in_u_pixel_sse2 (d, s); w--; ps++; if (pm) @@ -882,46 +862,33 @@ core_combine_in_u_sse2 (uint32_t* pd, } } -static force_inline void -core_combine_reverse_in_u_sse2 (uint32_t* pd, - const uint32_t* ps, - const uint32_t *pm, - int w) +static void +sse2_combine_in_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, d; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && ((unsigned long) pd & 15)) { s = combine1 (ps, pm); d = *pd; - *pd++ = core_combine_in_u_pixelsse2 (s, d); + *pd++ = core_combine_in_u_pixel_sse2 (s, d); ps++; w--; if (pm) pm++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*) pd); xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); @@ -948,7 +915,7 @@ core_combine_reverse_in_u_sse2 (uint32_t* pd, s = combine1 (ps, pm); d = *pd; - *pd++ = core_combine_in_u_pixelsse2 (s, d); + *pd++ = core_combine_in_u_pixel_sse2 (s, d); w--; ps++; if (pm) @@ -956,48 +923,35 @@ core_combine_reverse_in_u_sse2 (uint32_t* pd, } } -static force_inline void -core_combine_reverse_out_u_sse2 (uint32_t* pd, - const uint32_t* ps, - const uint32_t* pm, - int w) +static void +sse2_combine_out_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && ((unsigned long) pd & 15)) { uint32_t s = combine1 (ps, pm); uint32_t d = *pd; - *pd++ = pack_1x64_32 ( - pix_multiply_1x64 ( - unpack_32_1x64 (d), negate_1x64 ( - expand_alpha_1x64 (unpack_32_1x64 (s))))); - + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (d), negate_1x128 ( + expand_alpha_1x128 (unpack_32_1x128 (s))))); + if (pm) pm++; ps++; w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*) pd); @@ -1027,10 +981,10 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd, uint32_t s = combine1 (ps, pm); uint32_t d = *pd; - *pd++ = pack_1x64_32 ( - pix_multiply_1x64 ( - unpack_32_1x64 (d), negate_1x64 ( - expand_alpha_1x64 (unpack_32_1x64 (s))))); + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (d), negate_1x128 ( + expand_alpha_1x128 (unpack_32_1x128 (s))))); ps++; if (pm) pm++; @@ -1038,47 +992,34 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd, } } -static force_inline void -core_combine_out_u_sse2 (uint32_t* pd, - const uint32_t* ps, - const uint32_t* pm, - int w) +static void +sse2_combine_out_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && ((unsigned long) pd & 15)) { uint32_t s = combine1 (ps, pm); uint32_t d = *pd; - *pd++ = pack_1x64_32 ( - pix_multiply_1x64 ( - unpack_32_1x64 (s), negate_1x64 ( - expand_alpha_1x64 (unpack_32_1x64 (d))))); + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (s), negate_1x128 ( + expand_alpha_1x128 (unpack_32_1x128 (d))))); w--; ps++; if (pm) pm++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*) pd); @@ -1107,10 +1048,10 @@ core_combine_out_u_sse2 (uint32_t* pd, uint32_t s = combine1 (ps, pm); uint32_t d = *pd; - *pd++ = pack_1x64_32 ( - pix_multiply_1x64 ( - unpack_32_1x64 (s), negate_1x64 ( - expand_alpha_1x64 (unpack_32_1x64 (d))))); + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (s), negate_1x128 ( + expand_alpha_1x128 (unpack_32_1x128 (d))))); w--; ps++; if (pm) @@ -1122,20 +1063,22 @@ static force_inline uint32_t core_combine_atop_u_pixel_sse2 (uint32_t src, uint32_t dst) { - __m64 s = unpack_32_1x64 (src); - __m64 d = unpack_32_1x64 (dst); + __m128i s = unpack_32_1x128 (src); + __m128i d = unpack_32_1x128 (dst); - __m64 sa = negate_1x64 (expand_alpha_1x64 (s)); - __m64 da = expand_alpha_1x64 (d); + __m128i sa = negate_1x128 (expand_alpha_1x128 (s)); + __m128i da = expand_alpha_1x128 (d); - return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa)); + return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa)); } -static force_inline void -core_combine_atop_u_sse2 (uint32_t* pd, - const uint32_t* ps, - const uint32_t* pm, - int w) +static void +sse2_combine_atop_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, d; @@ -1144,11 +1087,6 @@ core_combine_atop_u_sse2 (uint32_t* pd, __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && ((unsigned long) pd & 15)) { s = combine1 (ps, pm); @@ -1161,18 +1099,8 @@ core_combine_atop_u_sse2 (uint32_t* pd, pm++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*) pd); @@ -1219,20 +1147,22 @@ static force_inline uint32_t core_combine_reverse_atop_u_pixel_sse2 (uint32_t src, uint32_t dst) { - __m64 s = unpack_32_1x64 (src); - __m64 d = unpack_32_1x64 (dst); + __m128i s = unpack_32_1x128 (src); + __m128i d = unpack_32_1x128 (dst); - __m64 sa = expand_alpha_1x64 (s); - __m64 da = negate_1x64 (expand_alpha_1x64 (d)); + __m128i sa = expand_alpha_1x128 (s); + __m128i da = negate_1x128 (expand_alpha_1x128 (d)); - return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa)); + return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa)); } -static force_inline void -core_combine_reverse_atop_u_sse2 (uint32_t* pd, - const uint32_t* ps, - const uint32_t* pm, - int w) +static void +sse2_combine_atop_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, d; @@ -1241,11 +1171,6 @@ core_combine_reverse_atop_u_sse2 (uint32_t* pd, __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && ((unsigned long) pd & 15)) { s = combine1 (ps, pm); @@ -1258,18 +1183,8 @@ core_combine_reverse_atop_u_sse2 (uint32_t* pd, pm++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*) pd); @@ -1316,20 +1231,22 @@ static force_inline uint32_t core_combine_xor_u_pixel_sse2 (uint32_t src, uint32_t dst) { - __m64 s = unpack_32_1x64 (src); - __m64 d = unpack_32_1x64 (dst); + __m128i s = unpack_32_1x128 (src); + __m128i d = unpack_32_1x128 (dst); - __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d)); - __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s)); + __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d)); + __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s)); - return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s)); + return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s)); } -static force_inline void -core_combine_xor_u_sse2 (uint32_t* dst, - const uint32_t* src, - const uint32_t *mask, - int width) +static void +sse2_combine_xor_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) { int w = width; uint32_t s, d; @@ -1342,11 +1259,6 @@ core_combine_xor_u_sse2 (uint32_t* dst, __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && ((unsigned long) pd & 15)) { s = combine1 (ps, pm); @@ -1359,18 +1271,8 @@ core_combine_xor_u_sse2 (uint32_t* dst, pm++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm); xmm_dst = load_128_aligned ((__m128i*) pd); @@ -1416,10 +1318,12 @@ core_combine_xor_u_sse2 (uint32_t* dst, } static force_inline void -core_combine_add_u_sse2 (uint32_t* dst, - const uint32_t* src, - const uint32_t* mask, - int width) +sse2_combine_add_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) { int w = width; uint32_t s, d; @@ -1427,11 +1331,6 @@ core_combine_add_u_sse2 (uint32_t* dst, const uint32_t* ps = src; const uint32_t* pm = mask; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = combine1 (ps, pm); @@ -1440,25 +1339,15 @@ core_combine_add_u_sse2 (uint32_t* dst, ps++; if (pm) pm++; - *pd++ = _mm_cvtsi64_si32 ( - _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d))); + *pd++ = _mm_cvtsi128_si32 ( + _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d))); w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { __m128i s; - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - s = combine4 ((__m128i*)ps, (__m128i*)pm); save_128_aligned ( @@ -1477,8 +1366,8 @@ core_combine_add_u_sse2 (uint32_t* dst, d = *pd; ps++; - *pd++ = _mm_cvtsi64_si32 ( - _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d))); + *pd++ = _mm_cvtsi128_si32 ( + _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d))); if (pm) pm++; } @@ -1488,36 +1377,33 @@ static force_inline uint32_t core_combine_saturate_u_pixel_sse2 (uint32_t src, uint32_t dst) { - __m64 ms = unpack_32_1x64 (src); - __m64 md = unpack_32_1x64 (dst); + __m128i ms = unpack_32_1x128 (src); + __m128i md = unpack_32_1x128 (dst); uint32_t sa = src >> 24; uint32_t da = ~dst >> 24; if (sa > da) { - ms = pix_multiply_1x64 ( - ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24))); + ms = pix_multiply_1x128 ( + ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24))); } - return pack_1x64_32 (_mm_adds_pu16 (md, ms)); + return pack_1x128_32 (_mm_adds_epu16 (md, ms)); } -static force_inline void -core_combine_saturate_u_sse2 (uint32_t * pd, - const uint32_t *ps, - const uint32_t *pm, - int w) +static void +sse2_combine_saturate_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, d; uint32_t pack_cmp; __m128i xmm_src, xmm_dst; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = combine1 (ps, pm); @@ -1530,18 +1416,8 @@ core_combine_saturate_u_sse2 (uint32_t * pd, pm++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst = load_128_aligned ((__m128i*)pd); xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm); @@ -1602,11 +1478,13 @@ core_combine_saturate_u_sse2 (uint32_t * pd, } } -static force_inline void -core_combine_src_ca_sse2 (uint32_t* pd, - const uint32_t* ps, - const uint32_t *pm, - int w) +static void +sse2_combine_src_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, m; @@ -1614,32 +1492,17 @@ core_combine_src_ca_sse2 (uint32_t* pd, __m128i xmm_mask_lo, xmm_mask_hi; __m128i xmm_dst_lo, xmm_dst_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; m = *pm++; - *pd++ = pack_1x64_32 ( - pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m))); + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m))); w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -1663,8 +1526,8 @@ core_combine_src_ca_sse2 (uint32_t* pd, { s = *ps++; m = *pm++; - *pd++ = pack_1x64_32 ( - pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m))); + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m))); w--; } } @@ -1674,19 +1537,21 @@ core_combine_over_ca_pixel_sse2 (uint32_t src, uint32_t mask, uint32_t dst) { - __m64 s = unpack_32_1x64 (src); - __m64 expAlpha = expand_alpha_1x64 (s); - __m64 unpk_mask = unpack_32_1x64 (mask); - __m64 unpk_dst = unpack_32_1x64 (dst); + __m128i s = unpack_32_1x128 (src); + __m128i expAlpha = expand_alpha_1x128 (s); + __m128i unpk_mask = unpack_32_1x128 (mask); + __m128i unpk_dst = unpack_32_1x128 (dst); - return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst)); + return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst)); } -static force_inline void -core_combine_over_ca_sse2 (uint32_t* pd, - const uint32_t* ps, - const uint32_t *pm, - int w) +static void +sse2_combine_over_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, m, d; @@ -1695,11 +1560,6 @@ core_combine_over_ca_sse2 (uint32_t* pd, __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; @@ -1710,18 +1570,8 @@ core_combine_over_ca_sse2 (uint32_t* pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -1763,19 +1613,21 @@ core_combine_over_reverse_ca_pixel_sse2 (uint32_t src, uint32_t mask, uint32_t dst) { - __m64 d = unpack_32_1x64 (dst); + __m128i d = unpack_32_1x128 (dst); - return pack_1x64_32 ( - over_1x64 (d, expand_alpha_1x64 (d), - pix_multiply_1x64 (unpack_32_1x64 (src), - unpack_32_1x64 (mask)))); + return pack_1x128_32 ( + over_1x128 (d, expand_alpha_1x128 (d), + pix_multiply_1x128 (unpack_32_1x128 (src), + unpack_32_1x128 (mask)))); } -static force_inline void -core_combine_over_reverse_ca_sse2 (uint32_t* pd, - const uint32_t* ps, - const uint32_t *pm, - int w) +static void +sse2_combine_over_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, m, d; @@ -1784,11 +1636,6 @@ core_combine_over_reverse_ca_sse2 (uint32_t* pd, __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; @@ -1799,18 +1646,8 @@ core_combine_over_reverse_ca_sse2 (uint32_t* pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -1849,11 +1686,13 @@ core_combine_over_reverse_ca_sse2 (uint32_t* pd, } } -static force_inline void -core_combine_in_ca_sse2 (uint32_t * pd, - const uint32_t *ps, - const uint32_t *pm, - int w) +static void +sse2_combine_in_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, m, d; @@ -1862,37 +1701,22 @@ core_combine_in_ca_sse2 (uint32_t * pd, __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; m = *pm++; d = *pd; - *pd++ = pack_1x64_32 ( - pix_multiply_1x64 ( - pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)), - expand_alpha_1x64 (unpack_32_1x64 (d)))); + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)), + expand_alpha_1x128 (unpack_32_1x128 (d)))); w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -1927,21 +1751,23 @@ core_combine_in_ca_sse2 (uint32_t * pd, m = *pm++; d = *pd; - *pd++ = pack_1x64_32 ( - pix_multiply_1x64 ( - pix_multiply_1x64 ( - unpack_32_1x64 (s), unpack_32_1x64 (m)), - expand_alpha_1x64 (unpack_32_1x64 (d)))); + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + pix_multiply_1x128 ( + unpack_32_1x128 (s), unpack_32_1x128 (m)), + expand_alpha_1x128 (unpack_32_1x128 (d)))); w--; } } -static force_inline void -core_combine_in_reverse_ca_sse2 (uint32_t * pd, - const uint32_t *ps, - const uint32_t *pm, - int w) +static void +sse2_combine_in_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, m, d; @@ -1950,37 +1776,22 @@ core_combine_in_reverse_ca_sse2 (uint32_t * pd, __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; m = *pm++; d = *pd; - *pd++ = pack_1x64_32 ( - pix_multiply_1x64 ( - unpack_32_1x64 (d), - pix_multiply_1x64 (unpack_32_1x64 (m), - expand_alpha_1x64 (unpack_32_1x64 (s))))); + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (d), + pix_multiply_1x128 (unpack_32_1x128 (m), + expand_alpha_1x128 (unpack_32_1x128 (s))))); w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -2014,20 +1825,22 @@ core_combine_in_reverse_ca_sse2 (uint32_t * pd, m = *pm++; d = *pd; - *pd++ = pack_1x64_32 ( - pix_multiply_1x64 ( - unpack_32_1x64 (d), - pix_multiply_1x64 (unpack_32_1x64 (m), - expand_alpha_1x64 (unpack_32_1x64 (s))))); + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (d), + pix_multiply_1x128 (unpack_32_1x128 (m), + expand_alpha_1x128 (unpack_32_1x128 (s))))); w--; } } -static force_inline void -core_combine_out_ca_sse2 (uint32_t * pd, - const uint32_t *ps, - const uint32_t *pm, - int w) +static void +sse2_combine_out_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, m, d; @@ -2036,37 +1849,22 @@ core_combine_out_ca_sse2 (uint32_t * pd, __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; m = *pm++; d = *pd; - *pd++ = pack_1x64_32 ( - pix_multiply_1x64 ( - pix_multiply_1x64 ( - unpack_32_1x64 (s), unpack_32_1x64 (m)), - negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d))))); + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + pix_multiply_1x128 ( + unpack_32_1x128 (s), unpack_32_1x128 (m)), + negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d))))); w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -2102,21 +1900,23 @@ core_combine_out_ca_sse2 (uint32_t * pd, m = *pm++; d = *pd; - *pd++ = pack_1x64_32 ( - pix_multiply_1x64 ( - pix_multiply_1x64 ( - unpack_32_1x64 (s), unpack_32_1x64 (m)), - negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d))))); + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + pix_multiply_1x128 ( + unpack_32_1x128 (s), unpack_32_1x128 (m)), + negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d))))); w--; } } -static force_inline void -core_combine_out_reverse_ca_sse2 (uint32_t * pd, - const uint32_t *ps, - const uint32_t *pm, - int w) +static void +sse2_combine_out_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, m, d; @@ -2125,38 +1925,23 @@ core_combine_out_reverse_ca_sse2 (uint32_t * pd, __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; m = *pm++; d = *pd; - *pd++ = pack_1x64_32 ( - pix_multiply_1x64 ( - unpack_32_1x64 (d), - negate_1x64 (pix_multiply_1x64 ( - unpack_32_1x64 (m), - expand_alpha_1x64 (unpack_32_1x64 (s)))))); + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (d), + negate_1x128 (pix_multiply_1x128 ( + unpack_32_1x128 (m), + expand_alpha_1x128 (unpack_32_1x128 (s)))))); w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -2194,12 +1979,12 @@ core_combine_out_reverse_ca_sse2 (uint32_t * pd, m = *pm++; d = *pd; - *pd++ = pack_1x64_32 ( - pix_multiply_1x64 ( - unpack_32_1x64 (d), - negate_1x64 (pix_multiply_1x64 ( - unpack_32_1x64 (m), - expand_alpha_1x64 (unpack_32_1x64 (s)))))); + *pd++ = pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (d), + negate_1x128 (pix_multiply_1x128 ( + unpack_32_1x128 (m), + expand_alpha_1x128 (unpack_32_1x128 (s)))))); w--; } } @@ -2209,23 +1994,25 @@ core_combine_atop_ca_pixel_sse2 (uint32_t src, uint32_t mask, uint32_t dst) { - __m64 m = unpack_32_1x64 (mask); - __m64 s = unpack_32_1x64 (src); - __m64 d = unpack_32_1x64 (dst); - __m64 sa = expand_alpha_1x64 (s); - __m64 da = expand_alpha_1x64 (d); + __m128i m = unpack_32_1x128 (mask); + __m128i s = unpack_32_1x128 (src); + __m128i d = unpack_32_1x128 (dst); + __m128i sa = expand_alpha_1x128 (s); + __m128i da = expand_alpha_1x128 (d); - s = pix_multiply_1x64 (s, m); - m = negate_1x64 (pix_multiply_1x64 (m, sa)); + s = pix_multiply_1x128 (s, m); + m = negate_1x128 (pix_multiply_1x128 (m, sa)); - return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da)); + return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da)); } -static force_inline void -core_combine_atop_ca_sse2 (uint32_t * pd, - const uint32_t *ps, - const uint32_t *pm, - int w) +static void +sse2_combine_atop_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, m, d; @@ -2235,11 +2022,6 @@ core_combine_atop_ca_sse2 (uint32_t * pd, __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; @@ -2250,18 +2032,8 @@ core_combine_atop_ca_sse2 (uint32_t * pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -2314,24 +2086,26 @@ core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src, uint32_t mask, uint32_t dst) { - __m64 m = unpack_32_1x64 (mask); - __m64 s = unpack_32_1x64 (src); - __m64 d = unpack_32_1x64 (dst); + __m128i m = unpack_32_1x128 (mask); + __m128i s = unpack_32_1x128 (src); + __m128i d = unpack_32_1x128 (dst); - __m64 da = negate_1x64 (expand_alpha_1x64 (d)); - __m64 sa = expand_alpha_1x64 (s); + __m128i da = negate_1x128 (expand_alpha_1x128 (d)); + __m128i sa = expand_alpha_1x128 (s); - s = pix_multiply_1x64 (s, m); - m = pix_multiply_1x64 (m, sa); + s = pix_multiply_1x128 (s, m); + m = pix_multiply_1x128 (m, sa); - return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da)); + return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da)); } -static force_inline void -core_combine_reverse_atop_ca_sse2 (uint32_t * pd, - const uint32_t *ps, - const uint32_t *pm, - int w) +static void +sse2_combine_atop_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, m, d; @@ -2341,11 +2115,6 @@ core_combine_reverse_atop_ca_sse2 (uint32_t * pd, __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; @@ -2356,18 +2125,8 @@ core_combine_reverse_atop_ca_sse2 (uint32_t * pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -2421,26 +2180,28 @@ core_combine_xor_ca_pixel_sse2 (uint32_t src, uint32_t mask, uint32_t dst) { - __m64 a = unpack_32_1x64 (mask); - __m64 s = unpack_32_1x64 (src); - __m64 d = unpack_32_1x64 (dst); + __m128i a = unpack_32_1x128 (mask); + __m128i s = unpack_32_1x128 (src); + __m128i d = unpack_32_1x128 (dst); - __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 ( - a, expand_alpha_1x64 (s))); - __m64 dest = pix_multiply_1x64 (s, a); - __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d)); + __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 ( + a, expand_alpha_1x128 (s))); + __m128i dest = pix_multiply_1x128 (s, a); + __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d)); - return pack_1x64_32 (pix_add_multiply_1x64 (&d, + return pack_1x128_32 (pix_add_multiply_1x128 (&d, &alpha_dst, &dest, &alpha_src)); } -static force_inline void -core_combine_xor_ca_sse2 (uint32_t * pd, - const uint32_t *ps, - const uint32_t *pm, - int w) +static void +sse2_combine_xor_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, m, d; @@ -2450,11 +2211,6 @@ core_combine_xor_ca_sse2 (uint32_t * pd, __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; @@ -2465,18 +2221,8 @@ core_combine_xor_ca_sse2 (uint32_t * pd, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); @@ -2527,11 +2273,13 @@ core_combine_xor_ca_sse2 (uint32_t * pd, } } -static force_inline void -core_combine_add_ca_sse2 (uint32_t * pd, - const uint32_t *ps, - const uint32_t *pm, - int w) +static void +sse2_combine_add_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * pd, + const uint32_t * ps, + const uint32_t * pm, + int w) { uint32_t s, m, d; @@ -2539,36 +2287,21 @@ core_combine_add_ca_sse2 (uint32_t * pd, __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { s = *ps++; m = *pm++; d = *pd; - *pd++ = pack_1x64_32 ( - _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s), - unpack_32_1x64 (m)), - unpack_32_1x64 (d))); + *pd++ = pack_1x128_32 ( + _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s), + unpack_32_1x128 (m)), + unpack_32_1x128 (d))); w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)ps); - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)ps); - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*)pd); @@ -2598,342 +2331,52 @@ core_combine_add_ca_sse2 (uint32_t * pd, m = *pm++; d = *pd; - *pd++ = pack_1x64_32 ( - _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s), - unpack_32_1x64 (m)), - unpack_32_1x64 (d))); + *pd++ = pack_1x128_32 ( + _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s), + unpack_32_1x128 (m)), + unpack_32_1x128 (d))); w--; } } -/* --------------------------------------------------- - * fb_compose_setup_sSE2 - */ -static force_inline __m64 -create_mask_16_64 (uint16_t mask) -{ - return _mm_set1_pi16 (mask); -} - static force_inline __m128i create_mask_16_128 (uint16_t mask) { return _mm_set1_epi16 (mask); } -static force_inline __m64 -create_mask_2x32_64 (uint32_t mask0, - uint32_t mask1) -{ - return _mm_set_pi32 (mask0, mask1); -} - +/* Work around a code generation bug in Sun Studio 12. */ +#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590) +# define create_mask_2x32_128(mask0, mask1) \ + (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1))) +#else static force_inline __m128i create_mask_2x32_128 (uint32_t mask0, uint32_t mask1) { return _mm_set_epi32 (mask0, mask1, mask0, mask1); } - -/* SSE2 code patch for fbcompose.c */ - -static void -sse2_combine_over_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_over_u_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_over_reverse_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_over_reverse_u_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_in_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_in_u_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_in_reverse_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_reverse_in_u_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_out_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_out_u_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_out_reverse_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_reverse_out_u_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_atop_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_atop_u_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_atop_reverse_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_reverse_atop_u_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_xor_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_xor_u_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_add_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_add_u_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_saturate_u (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_saturate_u_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_src_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_src_ca_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_over_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_over_ca_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_over_reverse_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_over_reverse_ca_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_in_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_in_ca_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_in_reverse_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_in_reverse_ca_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_out_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_out_ca_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_out_reverse_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_out_reverse_ca_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_atop_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_atop_ca_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_atop_reverse_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_reverse_atop_ca_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_xor_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_xor_ca_sse2 (dst, src, mask, width); - _mm_empty (); -} - -static void -sse2_combine_add_ca (pixman_implementation_t *imp, - pixman_op_t op, - uint32_t * dst, - const uint32_t * src, - const uint32_t * mask, - int width) -{ - core_combine_add_ca_sse2 (dst, src, mask, width); - _mm_empty (); -} - -/* ------------------------------------------------------------------- - * composite_over_n_8888 - */ +#endif static void sse2_composite_over_n_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t src; uint32_t *dst_line, *dst, d; - uint16_t w; + int32_t w; int dst_stride; __m128i xmm_src, xmm_alpha; __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; - src = _pixman_image_get_solid (src_image, dst_image->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); if (src == 0) return; PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); xmm_src = expand_pixel_32_1x128 (src); xmm_alpha = expand_alpha_1x128 (xmm_src); @@ -2942,28 +2385,20 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp, { dst = dst_line; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - dst_line += dst_stride; w = width; while (w && (unsigned long)dst & 15) { d = *dst; - *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src), - _mm_movepi64_pi64 (xmm_alpha), - unpack_32_1x64 (d))); + *dst++ = pack_1x128_32 (over_1x128 (xmm_src, + xmm_alpha, + unpack_32_1x128 (d))); w--; } - cache_prefetch ((__m128i*)dst); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)dst); - xmm_dst = load_128_aligned ((__m128i*)dst); unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); @@ -2983,48 +2418,34 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp, while (w) { d = *dst; - *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src), - _mm_movepi64_pi64 (xmm_alpha), - unpack_32_1x64 (d))); + *dst++ = pack_1x128_32 (over_1x128 (xmm_src, + xmm_alpha, + unpack_32_1x128 (d))); w--; } } - _mm_empty (); } -/* --------------------------------------------------------------------- - * composite_over_n_0565 - */ static void sse2_composite_over_n_0565 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t src; uint16_t *dst_line, *dst, d; - uint16_t w; + int32_t w; int dst_stride; __m128i xmm_src, xmm_alpha; __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; - src = _pixman_image_get_solid (src_image, dst_image->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); if (src == 0) return; PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); xmm_src = expand_pixel_32_1x128 (src); xmm_alpha = expand_alpha_1x128 (xmm_src); @@ -3033,9 +2454,6 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp, { dst = dst_line; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - dst_line += dst_stride; w = width; @@ -3044,20 +2462,14 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp, d = *dst; *dst++ = pack_565_32_16 ( - pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src), - _mm_movepi64_pi64 (xmm_alpha), - expand565_16_1x64 (d)))); + pack_1x128_32 (over_1x128 (xmm_src, + xmm_alpha, + expand565_16_1x128 (d)))); w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - while (w >= 8) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)dst); - xmm_dst = load_128_aligned ((__m128i*)dst); unpack_565_128_4x128 (xmm_dst, @@ -3083,61 +2495,43 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp, { d = *dst; *dst++ = pack_565_32_16 ( - pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src), - _mm_movepi64_pi64 (xmm_alpha), - expand565_16_1x64 (d)))); + pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha, + expand565_16_1x128 (d)))); } } - _mm_empty (); } -/* ------------------------------ - * composite_add_n_8888_8888_ca - */ static void sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { - uint32_t src, srca; + PIXMAN_COMPOSITE_ARGS (info); + uint32_t src; uint32_t *dst_line, d; uint32_t *mask_line, m; uint32_t pack_cmp; int dst_stride, mask_stride; - __m128i xmm_src, xmm_alpha; + __m128i xmm_src; __m128i xmm_dst; __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; - __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; + __m128i mmx_src, mmx_mask, mmx_dest; + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); - src = _pixman_image_get_solid (src_image, dst_image->bits.format); - srca = src >> 24; - if (src == 0) return; PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); xmm_src = _mm_unpacklo_epi8 ( create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); - xmm_alpha = expand_alpha_1x128 (xmm_src); - mmx_src = _mm_movepi64_pi64 (xmm_src); - mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); + mmx_src = xmm_src; while (height--) { @@ -3148,10 +2542,6 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, dst_line += dst_stride; mask_line += mask_stride; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { m = *pm++; @@ -3159,28 +2549,21 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, if (m) { d = *pd; - - mmx_mask = unpack_32_1x64 (m); - mmx_dest = unpack_32_1x64 (d); - *pd = pack_1x64_32 ( - _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest)); + mmx_mask = unpack_32_1x128 (m); + mmx_dest = unpack_32_1x128 (d); + + *pd = pack_1x128_32 ( + _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), + mmx_dest)); } pd++; w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_mask = load_128_unaligned ((__m128i*)pm); pack_cmp = @@ -3198,7 +2581,7 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, &xmm_mask_lo, &xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi); - + save_128_aligned ( (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst)); } @@ -3215,12 +2598,13 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, if (m) { d = *pd; - - mmx_mask = unpack_32_1x64 (m); - mmx_dest = unpack_32_1x64 (d); - *pd = pack_1x64_32 ( - _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest)); + mmx_mask = unpack_32_1x128 (m); + mmx_dest = unpack_32_1x128 (d); + + *pd = pack_1x128_32 ( + _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), + mmx_dest)); } pd++; @@ -3228,28 +2612,13 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, } } - _mm_empty (); } -/* --------------------------------------------------------------------------- - * composite_over_n_8888_8888_ca - */ - static void sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t src; uint32_t *dst_line, d; uint32_t *mask_line, m; @@ -3260,23 +2629,23 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; - __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; + __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; - src = _pixman_image_get_solid (src_image, dst_image->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); if (src == 0) return; PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); xmm_src = _mm_unpacklo_epi8 ( create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); xmm_alpha = expand_alpha_1x128 (xmm_src); - mmx_src = _mm_movepi64_pi64 (xmm_src); - mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); + mmx_src = xmm_src; + mmx_alpha = xmm_alpha; while (height--) { @@ -3287,10 +2656,6 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, dst_line += dst_stride; mask_line += mask_stride; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w && (unsigned long)pd & 15) { m = *pm++; @@ -3298,10 +2663,10 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, if (m) { d = *pd; - mmx_mask = unpack_32_1x64 (m); - mmx_dest = unpack_32_1x64 (d); + mmx_mask = unpack_32_1x128 (m); + mmx_dest = unpack_32_1x128 (d); - *pd = pack_1x64_32 (in_over_1x64 (&mmx_src, + *pd = pack_1x128_32 (in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)); @@ -3311,16 +2676,8 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)pd); - cache_prefetch ((__m128i*)pm); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)pd); - cache_prefetch_next ((__m128i*)pm); - xmm_mask = load_128_unaligned ((__m128i*)pm); pack_cmp = @@ -3356,11 +2713,11 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, if (m) { d = *pd; - mmx_mask = unpack_32_1x64 (m); - mmx_dest = unpack_32_1x64 (d); + mmx_mask = unpack_32_1x128 (m); + mmx_dest = unpack_32_1x128 (d); - *pd = pack_1x64_32 ( - in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)); + *pd = pack_1x128_32 ( + in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)); } pd++; @@ -3368,32 +2725,17 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, } } - _mm_empty (); } -/*--------------------------------------------------------------------- - * composite_over_8888_n_8888 - */ - static void sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t *dst_line, *dst; uint32_t *src_line, *src; uint32_t mask; - uint16_t w; + int32_t w; int dst_stride, src_stride; __m128i xmm_mask; @@ -3402,11 +2744,11 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp, __m128i xmm_alpha_lo, xmm_alpha_hi; PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - mask = _pixman_image_get_solid (mask_image, dst_image->bits.format); + mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8); xmm_mask = create_mask_16_128 (mask >> 24); @@ -3418,111 +2760,156 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp, src_line += src_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - cache_prefetch ((__m128i*)src); - while (w && (unsigned long)dst & 15) { uint32_t s = *src++; - uint32_t d = *dst; - - __m64 ms = unpack_32_1x64 (s); - __m64 alpha = expand_alpha_1x64 (ms); - __m64 dest = _mm_movepi64_pi64 (xmm_mask); - __m64 alpha_dst = unpack_32_1x64 (d); - - *dst++ = pack_1x64_32 ( - in_over_1x64 (&ms, &alpha, &dest, &alpha_dst)); + if (s) + { + uint32_t d = *dst; + + __m128i ms = unpack_32_1x128 (s); + __m128i alpha = expand_alpha_1x128 (ms); + __m128i dest = xmm_mask; + __m128i alpha_dst = unpack_32_1x128 (d); + + *dst = pack_1x128_32 ( + in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); + } + dst++; w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - cache_prefetch ((__m128i*)src); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)dst); - cache_prefetch_next ((__m128i*)src); - xmm_src = load_128_unaligned ((__m128i*)src); - xmm_dst = load_128_aligned ((__m128i*)dst); - - unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); - expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, - &xmm_alpha_lo, &xmm_alpha_hi); - - in_over_2x128 (&xmm_src_lo, &xmm_src_hi, - &xmm_alpha_lo, &xmm_alpha_hi, - &xmm_mask, &xmm_mask, - &xmm_dst_lo, &xmm_dst_hi); - - save_128_aligned ( - (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - dst += 4; - src += 4; - w -= 4; - } + if (!is_zero (xmm_src)) + { + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_mask, &xmm_mask, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + dst += 4; + src += 4; + w -= 4; + } while (w) { uint32_t s = *src++; - uint32_t d = *dst; - __m64 ms = unpack_32_1x64 (s); - __m64 alpha = expand_alpha_1x64 (ms); - __m64 mask = _mm_movepi64_pi64 (xmm_mask); - __m64 dest = unpack_32_1x64 (d); + if (s) + { + uint32_t d = *dst; + + __m128i ms = unpack_32_1x128 (s); + __m128i alpha = expand_alpha_1x128 (ms); + __m128i mask = xmm_mask; + __m128i dest = unpack_32_1x128 (d); + + *dst = pack_1x128_32 ( + in_over_1x128 (&ms, &alpha, &mask, &dest)); + } + + dst++; + w--; + } + } + +} + +static void +sse2_composite_src_x888_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int32_t w; + int dst_stride, src_stride; + + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w && (unsigned long)dst & 15) + { + *dst++ = *src++ | 0xff000000; + w--; + } - *dst++ = pack_1x64_32 ( - in_over_1x64 (&ms, &alpha, &mask, &dest)); + while (w >= 16) + { + __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4; + + xmm_src1 = load_128_unaligned ((__m128i*)src + 0); + xmm_src2 = load_128_unaligned ((__m128i*)src + 1); + xmm_src3 = load_128_unaligned ((__m128i*)src + 2); + xmm_src4 = load_128_unaligned ((__m128i*)src + 3); + + save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000)); + save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000)); + save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000)); + save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000)); + + dst += 16; + src += 16; + w -= 16; + } + while (w) + { + *dst++ = *src++ | 0xff000000; w--; } } - _mm_empty (); } -/* --------------------------------------------------------------------- - * composite_over_x888_n_8888 - */ static void sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t *dst_line, *dst; uint32_t *src_line, *src; uint32_t mask; int dst_stride, src_stride; - uint16_t w; + int32_t w; __m128i xmm_mask, xmm_alpha; __m128i xmm_src, xmm_src_lo, xmm_src_hi; __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - mask = _pixman_image_get_solid (mask_image, dst_image->bits.format); + mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8); xmm_mask = create_mask_16_128 (mask >> 24); xmm_alpha = mask_00ff; @@ -3535,36 +2922,24 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp, src_line += src_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - cache_prefetch ((__m128i*)src); - while (w && (unsigned long)dst & 15) { uint32_t s = (*src++) | 0xff000000; uint32_t d = *dst; - __m64 src = unpack_32_1x64 (s); - __m64 alpha = _mm_movepi64_pi64 (xmm_alpha); - __m64 mask = _mm_movepi64_pi64 (xmm_mask); - __m64 dest = unpack_32_1x64 (d); + __m128i src = unpack_32_1x128 (s); + __m128i alpha = xmm_alpha; + __m128i mask = xmm_mask; + __m128i dest = unpack_32_1x128 (d); - *dst++ = pack_1x64_32 ( - in_over_1x64 (&src, &alpha, &mask, &dest)); + *dst++ = pack_1x128_32 ( + in_over_1x128 (&src, &alpha, &mask, &dest)); w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)dst); - cache_prefetch ((__m128i*)src); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)dst); - cache_prefetch_next ((__m128i*)src); - xmm_src = _mm_or_si128 ( load_128_unaligned ((__m128i*)src), mask_ff000000); xmm_dst = load_128_aligned ((__m128i*)dst); @@ -3591,45 +2966,31 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp, uint32_t s = (*src++) | 0xff000000; uint32_t d = *dst; - __m64 src = unpack_32_1x64 (s); - __m64 alpha = _mm_movepi64_pi64 (xmm_alpha); - __m64 mask = _mm_movepi64_pi64 (xmm_mask); - __m64 dest = unpack_32_1x64 (d); + __m128i src = unpack_32_1x128 (s); + __m128i alpha = xmm_alpha; + __m128i mask = xmm_mask; + __m128i dest = unpack_32_1x128 (d); - *dst++ = pack_1x64_32 ( - in_over_1x64 (&src, &alpha, &mask, &dest)); + *dst++ = pack_1x128_32 ( + in_over_1x128 (&src, &alpha, &mask, &dest)); w--; } } - _mm_empty (); } -/* -------------------------------------------------------------------- - * composite_over_8888_8888 - */ static void sse2_composite_over_8888_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); int dst_stride, src_stride; uint32_t *dst_line, *dst; uint32_t *src_line, *src; PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); @@ -3638,76 +2999,49 @@ sse2_composite_over_8888_8888 (pixman_implementation_t *imp, while (height--) { - core_combine_over_u_sse2 (dst, src, NULL, width); + sse2_combine_over_u (imp, op, dst, src, NULL, width); dst += dst_stride; src += src_stride; } - _mm_empty (); } -/* ------------------------------------------------------------------ - * composite_over_8888_0565 - */ static force_inline uint16_t composite_over_8888_0565pixel (uint32_t src, uint16_t dst) { - __m64 ms; + __m128i ms; - ms = unpack_32_1x64 (src); + ms = unpack_32_1x128 (src); return pack_565_32_16 ( - pack_1x64_32 ( - over_1x64 ( - ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst)))); + pack_1x128_32 ( + over_1x128 ( + ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst)))); } static void sse2_composite_over_8888_0565 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint16_t *dst_line, *dst, d; uint32_t *src_line, *src, s; int dst_stride, src_stride; - uint16_t w; + int32_t w; __m128i xmm_alpha_lo, xmm_alpha_hi; __m128i xmm_src, xmm_src_lo, xmm_src_hi; __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); -#if 0 - /* FIXME - * - * I copy the code from MMX one and keep the fixme. - * If it's a problem there, probably is a problem here. - */ - assert (src_image->drawable == mask_image->drawable); -#endif - while (height--) { dst = dst_line; src = src_line; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - dst_line += dst_stride; src_line += src_stride; w = width; @@ -3723,17 +3057,9 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp, w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - /* It's a 8 pixel loop */ while (w >= 8) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)src); - cache_prefetch_next ((__m128i*)dst); - /* I'm loading unaligned because I'm not sure * about the address alignment. */ @@ -3783,57 +3109,42 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp, } } - _mm_empty (); } -/* ----------------------------------------------------------------- - * composite_over_n_8_8888 - */ - static void sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t src, srca; uint32_t *dst_line, *dst; uint8_t *mask_line, *mask; int dst_stride, mask_stride; - uint16_t w; + int32_t w; uint32_t m, d; __m128i xmm_src, xmm_alpha, xmm_def; __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; - __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; + __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; - src = _pixman_image_get_solid (src_image, dst_image->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); srca = src >> 24; if (src == 0) return; PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); xmm_def = create_mask_2x32_128 (src, src); xmm_src = expand_pixel_32_1x128 (src); xmm_alpha = expand_alpha_1x128 (xmm_src); - mmx_src = _mm_movepi64_pi64 (xmm_src); - mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); + mmx_src = xmm_src; + mmx_alpha = xmm_alpha; while (height--) { @@ -3843,10 +3154,6 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, mask_line += mask_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w && (unsigned long)dst & 15) { uint8_t m = *mask++; @@ -3854,10 +3161,10 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, if (m) { d = *dst; - mmx_mask = expand_pixel_8_1x64 (m); - mmx_dest = unpack_32_1x64 (d); + mmx_mask = expand_pixel_8_1x128 (m); + mmx_dest = unpack_32_1x128 (d); - *dst = pack_1x64_32 (in_over_1x64 (&mmx_src, + *dst = pack_1x128_32 (in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)); @@ -3867,16 +3174,8 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, dst++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)mask); - cache_prefetch_next ((__m128i*)dst); - m = *((uint32_t*)mask); if (srca == 0xff && m == 0xffffffff) @@ -3917,10 +3216,10 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, if (m) { d = *dst; - mmx_mask = expand_pixel_8_1x64 (m); - mmx_dest = unpack_32_1x64 (d); + mmx_mask = expand_pixel_8_1x128 (m); + mmx_dest = unpack_32_1x128 (d); - *dst = pack_1x64_32 (in_over_1x64 (&mmx_src, + *dst = pack_1x128_32 (in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)); @@ -3931,14 +3230,9 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, } } - _mm_empty (); } -/* ---------------------------------------------------------------- - * composite_over_n_8_8888 - */ - -pixman_bool_t +static pixman_bool_t pixman_fill_sse2 (uint32_t *bits, int stride, int bpp, @@ -3953,28 +3247,41 @@ pixman_fill_sse2 (uint32_t *bits, __m128i xmm_def; - if (bpp == 16 && (data >> 16 != (data & 0xffff))) - return FALSE; + if (bpp == 8) + { + uint8_t b; + uint16_t w; - if (bpp != 16 && bpp != 32) - return FALSE; + stride = stride * (int) sizeof (uint32_t) / 1; + byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x); + byte_width = width; + stride *= 1; - if (bpp == 16) + b = data & 0xff; + w = (b << 8) | b; + data = (w << 16) | w; + } + else if (bpp == 16) { stride = stride * (int) sizeof (uint32_t) / 2; byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); byte_width = 2 * width; stride *= 2; + + data = (data & 0xffff) * 0x00010001; } - else + else if (bpp == 32) { stride = stride * (int) sizeof (uint32_t) / 4; byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); byte_width = 4 * width; stride *= 4; } + else + { + return FALSE; + } - cache_prefetch ((__m128i*)byte_line); xmm_def = create_mask_2x32_128 (data, data); while (height--) @@ -3984,8 +3291,12 @@ pixman_fill_sse2 (uint32_t *bits, byte_line += stride; w = byte_width; - - cache_prefetch_next ((__m128i*)d); + if (w >= 1 && ((unsigned long)d & 1)) + { + *(uint8_t *)d = data; + w -= 1; + d += 1; + } while (w >= 2 && ((unsigned long)d & 3)) { @@ -4002,12 +3313,8 @@ pixman_fill_sse2 (uint32_t *bits, d += 4; } - cache_prefetch_next ((__m128i*)d); - while (w >= 128) { - cache_prefetch (((__m128i*)d) + 12); - save_128_aligned ((__m128i*)(d), xmm_def); save_128_aligned ((__m128i*)(d + 16), xmm_def); save_128_aligned ((__m128i*)(d + 32), xmm_def); @@ -4023,8 +3330,6 @@ pixman_fill_sse2 (uint32_t *bits, if (w >= 64) { - cache_prefetch (((__m128i*)d) + 8); - save_128_aligned ((__m128i*)(d), xmm_def); save_128_aligned ((__m128i*)(d + 16), xmm_def); save_128_aligned ((__m128i*)(d + 32), xmm_def); @@ -4034,8 +3339,6 @@ pixman_fill_sse2 (uint32_t *bits, w -= 64; } - cache_prefetch_next ((__m128i*)d); - if (w >= 32) { save_128_aligned ((__m128i*)(d), xmm_def); @@ -4053,8 +3356,6 @@ pixman_fill_sse2 (uint32_t *bits, w -= 16; } - cache_prefetch_next ((__m128i*)d); - while (w >= 4) { *(uint32_t *)d = data; @@ -4069,50 +3370,46 @@ pixman_fill_sse2 (uint32_t *bits, w -= 2; d += 2; } + + if (w >= 1) + { + *(uint8_t *)d = data; + w -= 1; + d += 1; + } } - _mm_empty (); return TRUE; } static void sse2_composite_src_n_8_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t src, srca; uint32_t *dst_line, *dst; uint8_t *mask_line, *mask; int dst_stride, mask_stride; - uint16_t w; + int32_t w; uint32_t m; __m128i xmm_src, xmm_def; __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; - src = _pixman_image_get_solid (src_image, dst_image->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); srca = src >> 24; if (src == 0) { - pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride, - PIXMAN_FORMAT_BPP (dst_image->bits.format), + pixman_fill_sse2 (dest_image->bits.bits, dest_image->bits.rowstride, + PIXMAN_FORMAT_BPP (dest_image->bits.format), dest_x, dest_y, width, height, 0); return; } PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); @@ -4127,19 +3424,14 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp, mask_line += mask_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w && (unsigned long)dst & 15) { uint8_t m = *mask++; if (m) { - *dst = pack_1x64_32 ( - pix_multiply_1x64 ( - _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m))); + *dst = pack_1x128_32 ( + pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m))); } else { @@ -4150,16 +3442,8 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp, dst++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)mask); - cache_prefetch_next ((__m128i*)dst); - m = *((uint32_t*)mask); if (srca == 0xff && m == 0xffffffff) @@ -4200,9 +3484,9 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp, if (m) { - *dst = pack_1x64_32 ( - pix_multiply_1x64 ( - _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m))); + *dst = pack_1x128_32 ( + pix_multiply_1x128 ( + xmm_src, expand_pixel_8_1x128 (m))); } else { @@ -4214,55 +3498,39 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp, } } - _mm_empty (); } -/*----------------------------------------------------------------------- - * composite_over_n_8_0565 - */ - static void sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { - uint32_t src, srca; + PIXMAN_COMPOSITE_ARGS (info); + uint32_t src; uint16_t *dst_line, *dst, d; uint8_t *mask_line, *mask; int dst_stride, mask_stride; - uint16_t w; + int32_t w; uint32_t m; - __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; + __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; __m128i xmm_src, xmm_alpha; __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; - src = _pixman_image_get_solid (src_image, dst_image->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); - srca = src >> 24; if (src == 0) return; PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); xmm_src = expand_pixel_32_1x128 (src); xmm_alpha = expand_alpha_1x128 (xmm_src); - mmx_src = _mm_movepi64_pi64 (xmm_src); - mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); + mmx_src = xmm_src; + mmx_alpha = xmm_alpha; while (height--) { @@ -4272,10 +3540,6 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, mask_line += mask_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w && (unsigned long)dst & 15) { m = *mask++; @@ -4283,12 +3547,12 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, if (m) { d = *dst; - mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m)); - mmx_dest = expand565_16_1x64 (d); + mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); + mmx_dest = expand565_16_1x128 (d); *dst = pack_565_32_16 ( - pack_1x64_32 ( - in_over_1x64 ( + pack_1x128_32 ( + in_over_1x128 ( &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); } @@ -4296,16 +3560,8 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, dst++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w >= 8) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)mask); - cache_prefetch_next ((__m128i*)dst); - xmm_dst = load_128_aligned ((__m128i*) dst); unpack_565_128_4x128 (xmm_dst, &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); @@ -4364,12 +3620,12 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, if (m) { d = *dst; - mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m)); - mmx_dest = expand565_16_1x64 (d); + mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); + mmx_dest = expand565_16_1x128 (d); *dst = pack_565_32_16 ( - pack_1x64_32 ( - in_over_1x64 ( + pack_1x128_32 ( + in_over_1x128 ( &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); } @@ -4378,52 +3634,28 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, } } - _mm_empty (); } -/* ----------------------------------------------------------------------- - * composite_over_pixbuf_0565 - */ - static void sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint16_t *dst_line, *dst, d; uint32_t *src_line, *src, s; int dst_stride, src_stride; - uint16_t w; + int32_t w; uint32_t opaque, zero; - __m64 ms; + __m128i ms; __m128i xmm_src, xmm_src_lo, xmm_src_hi; __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); -#if 0 - /* FIXME - * - * I copy the code from MMX one and keep the fixme. - * If it's a problem there, probably is a problem here. - */ - assert (src_image->drawable == mask_image->drawable); -#endif - while (height--) { dst = dst_line; @@ -4432,33 +3664,21 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp, src_line += src_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - while (w && (unsigned long)dst & 15) { s = *src++; d = *dst; - ms = unpack_32_1x64 (s); + ms = unpack_32_1x128 (s); *dst++ = pack_565_32_16 ( - pack_1x64_32 ( - over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d)))); + pack_1x128_32 ( + over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d)))); w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - while (w >= 8) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)src); - cache_prefetch_next ((__m128i*)dst); - /* First round */ xmm_src = load_128_unaligned ((__m128i*)src); xmm_dst = load_128_aligned ((__m128i*)dst); @@ -4515,60 +3735,36 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp, s = *src++; d = *dst; - ms = unpack_32_1x64 (s); + ms = unpack_32_1x128 (s); *dst++ = pack_565_32_16 ( - pack_1x64_32 ( - over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d)))); + pack_1x128_32 ( + over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d)))); w--; } } - _mm_empty (); } -/* ------------------------------------------------------------------------- - * composite_over_pixbuf_8888 - */ - static void sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t *dst_line, *dst, d; uint32_t *src_line, *src, s; int dst_stride, src_stride; - uint16_t w; + int32_t w; uint32_t opaque, zero; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); -#if 0 - /* FIXME - * - * I copy the code from MMX one and keep the fixme. - * If it's a problem there, probably is a problem here. - */ - assert (src_image->drawable == mask_image->drawable); -#endif - while (height--) { dst = dst_line; @@ -4577,32 +3773,20 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp, src_line += src_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - while (w && (unsigned long)dst & 15) { s = *src++; d = *dst; - *dst++ = pack_1x64_32 ( - over_rev_non_pre_1x64 ( - unpack_32_1x64 (s), unpack_32_1x64 (d))); + *dst++ = pack_1x128_32 ( + over_rev_non_pre_1x128 ( + unpack_32_1x128 (s), unpack_32_1x128 (d))); w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)src); - cache_prefetch_next ((__m128i*)dst); - xmm_src_hi = load_128_unaligned ((__m128i*)src); opaque = is_opaque (xmm_src_hi); @@ -4641,36 +3825,21 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp, s = *src++; d = *dst; - *dst++ = pack_1x64_32 ( - over_rev_non_pre_1x64 ( - unpack_32_1x64 (s), unpack_32_1x64 (d))); + *dst++ = pack_1x128_32 ( + over_rev_non_pre_1x128 ( + unpack_32_1x128 (s), unpack_32_1x128 (d))); w--; } } - _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * composite_over_n_8888_0565_ca - */ - static void sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t src; uint16_t *dst_line, *dst, d; uint32_t *mask_line, *mask, m; @@ -4682,22 +3851,22 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; - __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; + __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; - src = _pixman_image_get_solid (src_image, dst_image->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); if (src == 0) return; PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); xmm_src = expand_pixel_32_1x128 (src); xmm_alpha = expand_alpha_1x128 (xmm_src); - mmx_src = _mm_movepi64_pi64 (xmm_src); - mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); + mmx_src = xmm_src; + mmx_alpha = xmm_alpha; while (height--) { @@ -4707,10 +3876,6 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, mask_line += mask_stride; dst_line += dst_stride; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w && ((unsigned long)dst & 15)) { m = *(uint32_t *) mask; @@ -4718,12 +3883,12 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, if (m) { d = *dst; - mmx_mask = unpack_32_1x64 (m); - mmx_dest = expand565_16_1x64 (d); + mmx_mask = unpack_32_1x128 (m); + mmx_dest = expand565_16_1x128 (d); *dst = pack_565_32_16 ( - pack_1x64_32 ( - in_over_1x64 ( + pack_1x128_32 ( + in_over_1x128 ( &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); } @@ -4732,16 +3897,8 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, mask++; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w >= 8) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)mask); - cache_prefetch_next ((__m128i*)dst); - /* First round */ xmm_mask = load_128_unaligned ((__m128i*)mask); xmm_dst = load_128_aligned ((__m128i*)dst); @@ -4795,12 +3952,12 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, if (m) { d = *dst; - mmx_mask = unpack_32_1x64 (m); - mmx_dest = expand565_16_1x64 (d); + mmx_mask = unpack_32_1x128 (m); + mmx_dest = expand565_16_1x128 (d); *dst = pack_565_32_16 ( - pack_1x64_32 ( - in_over_1x64 ( + pack_1x128_32 ( + in_over_1x128 ( &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); } @@ -4810,47 +3967,30 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, } } - _mm_empty (); } -/* ----------------------------------------------------------------------- - * composite_in_n_8_8 - */ - static void sse2_composite_in_n_8_8 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint8_t *dst_line, *dst; uint8_t *mask_line, *mask; int dst_stride, mask_stride; - uint16_t w, d, m; + uint32_t d, m; uint32_t src; - uint8_t sa; + int32_t w; __m128i xmm_alpha; __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - src = _pixman_image_get_solid (src_image, dst_image->bits.format); - - sa = src >> 24; + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); @@ -4862,33 +4002,21 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp, mask_line += mask_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w && ((unsigned long)dst & 15)) { m = (uint32_t) *mask++; d = (uint32_t) *dst; - *dst++ = (uint8_t) pack_1x64_32 ( - pix_multiply_1x64 ( - pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha), - unpack_32_1x64 (m)), - unpack_32_1x64 (d))); + *dst++ = (uint8_t) pack_1x128_32 ( + pix_multiply_1x128 ( + pix_multiply_1x128 (xmm_alpha, + unpack_32_1x128 (m)), + unpack_32_1x128 (d))); w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w >= 16) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)mask); - cache_prefetch_next ((__m128i*)dst); - xmm_mask = load_128_unaligned ((__m128i*)mask); xmm_dst = load_128_aligned ((__m128i*)dst); @@ -4916,48 +4044,115 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp, m = (uint32_t) *mask++; d = (uint32_t) *dst; - *dst++ = (uint8_t) pack_1x64_32 ( - pix_multiply_1x64 ( - pix_multiply_1x64 ( - _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)), - unpack_32_1x64 (d))); + *dst++ = (uint8_t) pack_1x128_32 ( + pix_multiply_1x128 ( + pix_multiply_1x128 ( + xmm_alpha, unpack_32_1x128 (m)), + unpack_32_1x128 (d))); w--; } } - _mm_empty (); } -/* --------------------------------------------------------------------------- - * composite_in_8_8 - */ +static void +sse2_composite_in_n_8 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint8_t *dst_line, *dst; + int dst_stride; + uint32_t d; + uint32_t src; + int32_t w; + + __m128i xmm_alpha; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + + xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); + + src = src >> 24; + + if (src == 0xff) + return; + + if (src == 0x00) + { + pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, + 8, dest_x, dest_y, width, height, src); + + return; + } + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + w = width; + + while (w && ((unsigned long)dst & 15)) + { + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x128_32 ( + pix_multiply_1x128 ( + xmm_alpha, + unpack_32_1x128 (d))); + w--; + } + + while (w >= 16) + { + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, + &xmm_dst_lo, &xmm_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + dst += 16; + w -= 16; + } + + while (w) + { + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x128_32 ( + pix_multiply_1x128 ( + xmm_alpha, + unpack_32_1x128 (d))); + w--; + } + } + +} static void sse2_composite_in_8_8 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint8_t *dst_line, *dst; uint8_t *src_line, *src; int src_stride, dst_stride; - uint16_t w; + int32_t w; uint32_t s, d; __m128i xmm_src, xmm_src_lo, xmm_src_hi; __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); @@ -4969,31 +4164,19 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp, src_line += src_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - while (w && ((unsigned long)dst & 15)) { s = (uint32_t) *src++; d = (uint32_t) *dst; - *dst++ = (uint8_t) pack_1x64_32 ( - pix_multiply_1x64 ( - unpack_32_1x64 (s), unpack_32_1x64 (d))); + *dst++ = (uint8_t) pack_1x128_32 ( + pix_multiply_1x128 ( + unpack_32_1x128 (s), unpack_32_1x128 (d))); w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - while (w >= 16) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)src); - cache_prefetch_next ((__m128i*)dst); - xmm_src = load_128_unaligned ((__m128i*)src); xmm_dst = load_128_aligned ((__m128i*)dst); @@ -5017,40 +4200,24 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp, s = (uint32_t) *src++; d = (uint32_t) *dst; - *dst++ = (uint8_t) pack_1x64_32 ( - pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d))); + *dst++ = (uint8_t) pack_1x128_32 ( + pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d))); w--; } } - _mm_empty (); } -/* ------------------------------------------------------------------------- - * composite_add_n_8_8 - */ - static void sse2_composite_add_n_8_8 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint8_t *dst_line, *dst; uint8_t *mask_line, *mask; int dst_stride, mask_stride; - uint16_t w; + int32_t w; uint32_t src; - uint8_t sa; uint32_t m, d; __m128i xmm_alpha; @@ -5058,13 +4225,11 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp, __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - src = _pixman_image_get_solid (src_image, dst_image->bits.format); - - sa = src >> 24; + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); @@ -5076,33 +4241,21 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp, mask_line += mask_stride; w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w && ((unsigned long)dst & 15)) { m = (uint32_t) *mask++; d = (uint32_t) *dst; - *dst++ = (uint8_t) pack_1x64_32 ( - _mm_adds_pu16 ( - pix_multiply_1x64 ( - _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)), - unpack_32_1x64 (d))); + *dst++ = (uint8_t) pack_1x128_32 ( + _mm_adds_epu16 ( + pix_multiply_1x128 ( + xmm_alpha, unpack_32_1x128 (m)), + unpack_32_1x128 (d))); w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)mask); - cache_prefetch ((__m128i*)dst); - while (w >= 16) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)mask); - cache_prefetch_next ((__m128i*)dst); - xmm_mask = load_128_unaligned ((__m128i*)mask); xmm_dst = load_128_aligned ((__m128i*)dst); @@ -5129,107 +4282,148 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp, m = (uint32_t) *mask++; d = (uint32_t) *dst; - *dst++ = (uint8_t) pack_1x64_32 ( - _mm_adds_pu16 ( - pix_multiply_1x64 ( - _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)), - unpack_32_1x64 (d))); + *dst++ = (uint8_t) pack_1x128_32 ( + _mm_adds_epu16 ( + pix_multiply_1x128 ( + xmm_alpha, unpack_32_1x128 (m)), + unpack_32_1x128 (d))); w--; } } - _mm_empty (); } -/* ---------------------------------------------------------------------- - * composite_add_8000_8000 - */ - static void -sse2_composite_add_8000_8000 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) +sse2_composite_add_n_8 (pixman_implementation_t *imp, + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint8_t *dst_line, *dst; - uint8_t *src_line, *src; - int dst_stride, src_stride; - uint16_t w; - uint16_t t; + int dst_stride; + int32_t w; + uint32_t src; + + __m128i xmm_src; PIXMAN_IMAGE_GET_LINE ( - src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); - PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); - while (height--) + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); + + src >>= 24; + + if (src == 0x00) + return; + + if (src == 0xff) { - dst = dst_line; - src = src_line; + pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, + 8, dest_x, dest_y, width, height, 0xff); + + return; + } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); + src = (src << 24) | (src << 16) | (src << 8) | src; + xmm_src = _mm_set_epi32 (src, src, src, src); + while (height--) + { + dst = dst_line; dst_line += dst_stride; - src_line += src_stride; w = width; - /* Small head */ - while (w && (unsigned long)dst & 3) + while (w && ((unsigned long)dst & 15)) { - t = (*dst) + (*src++); - *dst++ = t | (0 - (t >> 8)); + *dst = (uint8_t)_mm_cvtsi128_si32 ( + _mm_adds_epu8 ( + xmm_src, + _mm_cvtsi32_si128 (*dst))); + w--; + dst++; } - core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2); - - /* Small tail */ - dst += w & 0xfffc; - src += w & 0xfffc; + while (w >= 16) + { + save_128_aligned ( + (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst))); - w &= 3; + dst += 16; + w -= 16; + } while (w) { - t = (*dst) + (*src++); - *dst++ = t | (0 - (t >> 8)); + *dst = (uint8_t)_mm_cvtsi128_si32 ( + _mm_adds_epu8 ( + xmm_src, + _mm_cvtsi32_si128 (*dst))); + + w--; + dst++; + } + } + +} + +static void +sse2_composite_add_8_8 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint8_t *dst_line, *dst; + uint8_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; + uint16_t t; + + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + src = src_line; + + dst_line += dst_stride; + src_line += src_stride; + w = width; + + /* Small head */ + while (w && (unsigned long)dst & 3) + { + t = (*dst) + (*src++); + *dst++ = t | (0 - (t >> 8)); + w--; + } + + sse2_combine_add_u (imp, op, + (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2); + + /* Small tail */ + dst += w & 0xfffc; + src += w & 0xfffc; + + w &= 3; + + while (w) + { + t = (*dst) + (*src++); + *dst++ = t | (0 - (t >> 8)); w--; } } - _mm_empty (); } -/* --------------------------------------------------------------------- - * composite_add_8888_8888 - */ static void sse2_composite_add_8888_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t *dst_line, *dst; uint32_t *src_line, *src; int dst_stride, src_stride; @@ -5237,7 +4431,7 @@ sse2_composite_add_8888_8888 (pixman_implementation_t *imp, PIXMAN_IMAGE_GET_LINE ( src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); while (height--) { @@ -5246,16 +4440,11 @@ sse2_composite_add_8888_8888 (pixman_implementation_t *imp, src = src_line; src_line += src_stride; - core_combine_add_u_sse2 (dst, src, NULL, width); + sse2_combine_add_u (imp, op, dst, src, NULL, width); } - _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * sse2_composite_copy_area - */ - static pixman_bool_t pixman_blt_sse2 (uint32_t *src_bits, uint32_t *dst_bits, @@ -5265,8 +4454,8 @@ pixman_blt_sse2 (uint32_t *src_bits, int dst_bpp, int src_x, int src_y, - int dst_x, - int dst_y, + int dest_x, + int dest_y, int width, int height) { @@ -5282,7 +4471,7 @@ pixman_blt_sse2 (uint32_t *src_bits, src_stride = src_stride * (int) sizeof (uint32_t) / 2; dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); - dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); + dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); byte_width = 2 * width; src_stride *= 2; dst_stride *= 2; @@ -5292,7 +4481,7 @@ pixman_blt_sse2 (uint32_t *src_bits, src_stride = src_stride * (int) sizeof (uint32_t) / 4; dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); - dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); + dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); byte_width = 4 * width; src_stride *= 4; dst_stride *= 4; @@ -5302,9 +4491,6 @@ pixman_blt_sse2 (uint32_t *src_bits, return FALSE; } - cache_prefetch ((__m128i*)src_bytes); - cache_prefetch ((__m128i*)dst_bytes); - while (height--) { int w; @@ -5314,9 +4500,6 @@ pixman_blt_sse2 (uint32_t *src_bits, dst_bytes += dst_stride; w = byte_width; - cache_prefetch_next ((__m128i*)s); - cache_prefetch_next ((__m128i*)d); - while (w >= 2 && ((unsigned long)d & 3)) { *(uint16_t *)d = *(uint16_t *)s; @@ -5334,17 +4517,10 @@ pixman_blt_sse2 (uint32_t *src_bits, d += 4; } - cache_prefetch_next ((__m128i*)s); - cache_prefetch_next ((__m128i*)d); - while (w >= 64) { __m128i xmm0, xmm1, xmm2, xmm3; - /* 128 bytes ahead */ - cache_prefetch (((__m128i*)s) + 8); - cache_prefetch (((__m128i*)d) + 8); - xmm0 = load_128_unaligned ((__m128i*)(s)); xmm1 = load_128_unaligned ((__m128i*)(s + 16)); xmm2 = load_128_unaligned ((__m128i*)(s + 32)); @@ -5360,9 +4536,6 @@ pixman_blt_sse2 (uint32_t *src_bits, w -= 64; } - cache_prefetch_next ((__m128i*)s); - cache_prefetch_next ((__m128i*)d); - while (w >= 16) { save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) ); @@ -5372,9 +4545,6 @@ pixman_blt_sse2 (uint32_t *src_bits, s += 16; } - cache_prefetch_next ((__m128i*)s); - cache_prefetch_next ((__m128i*)d); - while (w >= 4) { *(uint32_t *)d = *(uint32_t *)s; @@ -5393,63 +4563,43 @@ pixman_blt_sse2 (uint32_t *src_bits, } } - _mm_empty (); return TRUE; } static void sse2_composite_copy_area (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); pixman_blt_sse2 (src_image->bits.bits, - dst_image->bits.bits, + dest_image->bits.bits, src_image->bits.rowstride, - dst_image->bits.rowstride, + dest_image->bits.rowstride, PIXMAN_FORMAT_BPP (src_image->bits.format), - PIXMAN_FORMAT_BPP (dst_image->bits.format), + PIXMAN_FORMAT_BPP (dest_image->bits.format), src_x, src_y, dest_x, dest_y, width, height); } static void sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src_image, - pixman_image_t * mask_image, - pixman_image_t * dst_image, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); uint32_t *src, *src_line, s; uint32_t *dst, *dst_line, d; uint8_t *mask, *mask_line; uint32_t m; int src_stride, mask_stride, dst_stride; - uint16_t w; + int32_t w; + __m128i ms; __m128i xmm_src, xmm_src_lo, xmm_src_hi; __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; PIXMAN_IMAGE_GET_LINE ( - dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); PIXMAN_IMAGE_GET_LINE ( @@ -5466,45 +4616,30 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, w = width; - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - cache_prefetch ((__m128i*)mask); - while (w && (unsigned long)dst & 15) { s = 0xff000000 | *src++; m = (uint32_t) *mask++; d = *dst; - - __m64 ms = unpack_32_1x64 (s); + ms = unpack_32_1x128 (s); if (m != 0xff) { - __m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m)); - __m64 md = unpack_32_1x64 (d); + __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); + __m128i md = unpack_32_1x128 (d); - ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md); + ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md); } - *dst++ = pack_1x64_32 (ms); + *dst++ = pack_1x128_32 (ms); w--; } - /* call prefetch hint to optimize cache load*/ - cache_prefetch ((__m128i*)src); - cache_prefetch ((__m128i*)dst); - cache_prefetch ((__m128i*)mask); - while (w >= 4) { - /* fill cache line with next memory */ - cache_prefetch_next ((__m128i*)src); - cache_prefetch_next ((__m128i*)dst); - cache_prefetch_next ((__m128i*)mask); - m = *(uint32_t*) mask; - xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000); + xmm_src = _mm_or_si128 ( + load_128_unaligned ((__m128i*)src), mask_ff000000); if (m == 0xffffffff) { @@ -5520,9 +4655,12 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); - expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + expand_alpha_rev_2x128 ( + xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); - in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, + &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); } @@ -5547,15 +4685,15 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, } else { - __m64 ma, md, ms; + __m128i ma, md, ms; d = *dst; - ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m)); - md = unpack_32_1x64 (d); - ms = unpack_32_1x64 (s); + ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); + md = unpack_32_1x128 (d); + ms = unpack_32_1x128 (s); - *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md)); + *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md)); } } @@ -5566,261 +4704,1368 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, } } - _mm_empty (); } -static const pixman_fast_path_t sse2_fast_paths[] = +static void +sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) { - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, sse2_composite_over_n_8_0565, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, sse2_composite_over_n_8_0565, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_over_n_0565, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_over_8888_8888, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_over_8888_8888, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_over_8888_8888, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_over_8888_8888, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_over_8888_0565, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, sse2_composite_over_8888_0565, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8_8888, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8_8888, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8_8888, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8_8888, 0 }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_x888_8_8888, 0 }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888, 0 }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_x888_8_8888, 0 }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888, 0 }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK }, - { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK }, - { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK }, - { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK }, - { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 }, - - { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_add_n_8888_8888_ca, NEED_COMPONENT_ALPHA }, - { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_composite_add_8000_8000, 0 }, - { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_add_8888_8888, 0 }, - { PIXMAN_OP_ADD, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_add_8888_8888, 0 }, - { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_composite_add_n_8_8, 0 }, - - { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_src_n_8_8888, 0 }, - { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_src_n_8_8888, 0 }, - { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_src_n_8_8888, 0 }, - { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_src_n_8_8888, 0 }, - { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_copy_area, 0 }, - { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_copy_area, 0 }, - { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 }, - { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 }, - { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 }, - { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 }, - { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_copy_area, 0 }, - { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, sse2_composite_copy_area, 0 }, - - { PIXMAN_OP_IN, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_composite_in_8_8, 0 }, - { PIXMAN_OP_IN, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_composite_in_n_8_8, 0 }, + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *src, *src_line, s; + uint32_t *dst, *dst_line, d; + uint8_t *mask, *mask_line; + uint32_t m; + int src_stride, mask_stride, dst_stride; + int32_t w; - { PIXMAN_OP_NONE }, -}; + __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; -/* - * Work around GCC bug causing crashes in Mozilla with SSE2 - * - * When using -msse, gcc generates movdqa instructions assuming that - * the stack is 16 byte aligned. Unfortunately some applications, such - * as Mozilla and Mono, end up aligning the stack to 4 bytes, which - * causes the movdqa instructions to fail. - * - * The __force_align_arg_pointer__ makes gcc generate a prologue that - * realigns the stack pointer to 16 bytes. - * - * On x86-64 this is not necessary because the standard ABI already - * calls for a 16 byte aligned stack. - * - * See https://bugs.freedesktop.org/show_bug.cgi?id=15693 - */ -#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) -__attribute__((__force_align_arg_pointer__)) -#endif -static void -sse2_composite (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * src, - pixman_image_t * mask, - pixman_image_t * dest, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) -{ - if (_pixman_run_fast_path (sse2_fast_paths, imp, - op, src, mask, dest, - src_x, src_y, - mask_x, mask_y, - dest_x, dest_y, - width, height)) + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) { - return; - } + src = src_line; + src_line += src_stride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; - _pixman_implementation_composite (imp->delegate, op, - src, mask, dest, - src_x, src_y, - mask_x, mask_y, - dest_x, dest_y, - width, height); -} + w = width; -#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) -__attribute__((__force_align_arg_pointer__)) -#endif -static pixman_bool_t -sse2_blt (pixman_implementation_t *imp, - uint32_t * src_bits, - uint32_t * dst_bits, - int src_stride, - int dst_stride, - int src_bpp, - int dst_bpp, - int src_x, - int src_y, - int dst_x, - int dst_y, - int width, - int height) -{ - if (!pixman_blt_sse2 ( - src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, - src_x, src_y, dst_x, dst_y, width, height)) + while (w && (unsigned long)dst & 15) + { + uint32_t sa; - { - return _pixman_implementation_blt ( - imp->delegate, - src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, - src_x, src_y, dst_x, dst_y, width, height); - } + s = *src++; + m = (uint32_t) *mask++; + d = *dst; - return TRUE; -} + sa = s >> 24; -#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) -__attribute__((__force_align_arg_pointer__)) -#endif -static pixman_bool_t -sse2_fill (pixman_implementation_t *imp, - uint32_t * bits, - int stride, - int bpp, - int x, - int y, - int width, - int height, - uint32_t xor) -{ - if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor)) - { - return _pixman_implementation_fill ( - imp->delegate, bits, stride, bpp, x, y, width, height, xor); + if (m) + { + if (sa == 0xff && m == 0xff) + { + *dst = s; + } + else + { + __m128i ms, md, ma, msa; + + ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); + ms = unpack_32_1x128 (s); + md = unpack_32_1x128 (d); + + msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); + + *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); + } + } + + dst++; + w--; + } + + while (w >= 4) + { + m = *(uint32_t *) mask; + + if (m) + { + xmm_src = load_128_unaligned ((__m128i*)src); + + if (m == 0xffffffff && is_opaque (xmm_src)) + { + save_128_aligned ((__m128i *)dst, xmm_src); + } + else + { + xmm_dst = load_128_aligned ((__m128i *)dst); + + xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, + &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + } + + src += 4; + dst += 4; + mask += 4; + w -= 4; + } + + while (w) + { + uint32_t sa; + + s = *src++; + m = (uint32_t) *mask++; + d = *dst; + + sa = s >> 24; + + if (m) + { + if (sa == 0xff && m == 0xff) + { + *dst = s; + } + else + { + __m128i ms, md, ma, msa; + + ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); + ms = unpack_32_1x128 (s); + md = unpack_32_1x128 (d); + + msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); + + *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); + } + } + + dst++; + w--; + } } - return TRUE; } -#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) -__attribute__((__force_align_arg_pointer__)) -#endif -pixman_implementation_t * -_pixman_implementation_create_sse2 (void) +static void +sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) { - pixman_implementation_t *mmx = _pixman_implementation_create_mmx (); - pixman_implementation_t *imp = _pixman_implementation_create (mmx); + PIXMAN_COMPOSITE_ARGS (info); + uint32_t src; + uint32_t *dst_line, *dst; + __m128i xmm_src; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_dsta_hi, xmm_dsta_lo; + int dst_stride; + int32_t w; - /* SSE2 constants */ - mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000); - mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000); - mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0); - mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f); - mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000); - mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00); - mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8); - mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0); - mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000); - mask_0080 = create_mask_16_128 (0x0080); - mask_00ff = create_mask_16_128 (0x00ff); - mask_0101 = create_mask_16_128 (0x0101); - mask_ffff = create_mask_16_128 (0xffff); - mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000); - mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); - /* MMX constants */ - mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f); - mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840); + if (src == 0) + return; - mask_x0080 = create_mask_16_64 (0x0080); - mask_x00ff = create_mask_16_64 (0x00ff); - mask_x0101 = create_mask_16_64 (0x0101); - mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000); + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - _mm_empty (); + xmm_src = expand_pixel_32_1x128 (src); - /* Set up function pointers */ + while (height--) + { + dst = dst_line; - /* SSE code patch for fbcompose.c */ - imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u; - imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u; - imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u; - imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u; - imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u; - imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u; - imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u; - imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u; - imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u; - imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u; + dst_line += dst_stride; + w = width; - imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u; + while (w && (unsigned long)dst & 15) + { + __m128i vd; - imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca; - imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca; - imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca; - imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca; - imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca; - imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca; - imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca; - imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca; - imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca; - imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca; - imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca; + vd = unpack_32_1x128 (*dst); - imp->composite = sse2_composite; - imp->blt = sse2_blt; - imp->fill = sse2_fill; + *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd), + xmm_src)); + w--; + dst++; + } - return imp; -} + while (w >= 4) + { + __m128i tmp_lo, tmp_hi; -#endif /* USE_SSE2 */ + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi); + + tmp_lo = xmm_src; + tmp_hi = xmm_src; + + over_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_dsta_lo, &xmm_dsta_hi, + &tmp_lo, &tmp_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi)); + + w -= 4; + dst += 4; + } + + while (w) + { + __m128i vd; + + vd = unpack_32_1x128 (*dst); + + *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd), + xmm_src)); + w--; + dst++; + } + + } + +} + +static void +sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *src, *src_line, s; + uint32_t *dst, *dst_line, d; + uint32_t *mask, *mask_line; + uint32_t m; + int src_stride, mask_stride, dst_stride; + int32_t w; + + __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + PIXMAN_IMAGE_GET_LINE ( + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + src = src_line; + src_line += src_stride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + + w = width; + + while (w && (unsigned long)dst & 15) + { + uint32_t sa; + + s = *src++; + m = (*mask++) >> 24; + d = *dst; + + sa = s >> 24; + + if (m) + { + if (sa == 0xff && m == 0xff) + { + *dst = s; + } + else + { + __m128i ms, md, ma, msa; + + ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); + ms = unpack_32_1x128 (s); + md = unpack_32_1x128 (d); + + msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); + + *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); + } + } + + dst++; + w--; + } + + while (w >= 4) + { + xmm_mask = load_128_unaligned ((__m128i*)mask); + + if (!is_transparent (xmm_mask)) + { + xmm_src = load_128_unaligned ((__m128i*)src); + + if (is_opaque (xmm_mask) && is_opaque (xmm_src)) + { + save_128_aligned ((__m128i *)dst, xmm_src); + } + else + { + xmm_dst = load_128_aligned ((__m128i *)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); + expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, + &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + } + + src += 4; + dst += 4; + mask += 4; + w -= 4; + } + + while (w) + { + uint32_t sa; + + s = *src++; + m = (*mask++) >> 24; + d = *dst; + + sa = s >> 24; + + if (m) + { + if (sa == 0xff && m == 0xff) + { + *dst = s; + } + else + { + __m128i ms, md, ma, msa; + + ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); + ms = unpack_32_1x128 (s); + md = unpack_32_1x128 (d); + + msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); + + *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); + } + } + + dst++; + w--; + } + } + +} + +/* A variant of 'sse2_combine_over_u' with minor tweaks */ +static force_inline void +scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd, + const uint32_t* ps, + int32_t w, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + pixman_fixed_t max_vx, + pixman_bool_t fully_transparent_src) +{ + uint32_t s, d; + const uint32_t* pm = NULL; + + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_alpha_lo, xmm_alpha_hi; + + if (fully_transparent_src) + return; + + /* Align dst on a 16-byte boundary */ + while (w && ((unsigned long)pd & 15)) + { + d = *pd; + s = combine1 (ps + (vx >> 16), pm); + vx += unit_x; + + *pd++ = core_combine_over_u_pixel_sse2 (s, d); + if (pm) + pm++; + w--; + } + + while (w >= 4) + { + __m128i tmp; + uint32_t tmp1, tmp2, tmp3, tmp4; + + tmp1 = ps[vx >> 16]; + vx += unit_x; + tmp2 = ps[vx >> 16]; + vx += unit_x; + tmp3 = ps[vx >> 16]; + vx += unit_x; + tmp4 = ps[vx >> 16]; + vx += unit_x; + + tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); + + xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm); + + if (is_opaque (xmm_src_hi)) + { + save_128_aligned ((__m128i*)pd, xmm_src_hi); + } + else if (!is_zero (xmm_src_hi)) + { + xmm_dst_hi = load_128_aligned ((__m128i*) pd); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 ( + xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi); + + over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst_lo, &xmm_dst_hi); + + /* rebuid the 4 pixel data and save*/ + save_128_aligned ((__m128i*)pd, + pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + w -= 4; + pd += 4; + if (pm) + pm += 4; + } + + while (w) + { + d = *pd; + s = combine1 (ps + (vx >> 16), pm); + vx += unit_x; + + *pd++ = core_combine_over_u_pixel_sse2 (s, d); + if (pm) + pm++; + + w--; + } +} + +FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER, + scaled_nearest_scanline_sse2_8888_8888_OVER, + uint32_t, uint32_t, COVER) +FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER, + scaled_nearest_scanline_sse2_8888_8888_OVER, + uint32_t, uint32_t, NONE) +FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER, + scaled_nearest_scanline_sse2_8888_8888_OVER, + uint32_t, uint32_t, PAD) + +static force_inline void +scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask, + uint32_t * dst, + const uint32_t * src, + int32_t w, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + pixman_fixed_t max_vx, + pixman_bool_t zero_src) +{ + __m128i xmm_mask; + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_lo, xmm_alpha_hi; + + if (zero_src || (*mask >> 24) == 0) + return; + + xmm_mask = create_mask_16_128 (*mask >> 24); + + while (w && (unsigned long)dst & 15) + { + uint32_t s = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + + if (s) + { + uint32_t d = *dst; + + __m128i ms = unpack_32_1x128 (s); + __m128i alpha = expand_alpha_1x128 (ms); + __m128i dest = xmm_mask; + __m128i alpha_dst = unpack_32_1x128 (d); + + *dst = pack_1x128_32 ( + in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); + } + dst++; + w--; + } + + while (w >= 4) + { + uint32_t tmp1, tmp2, tmp3, tmp4; + + tmp1 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp2 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp3 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp4 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + + xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); + + if (!is_zero (xmm_src)) + { + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_mask, &xmm_mask, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + dst += 4; + w -= 4; + } + + while (w) + { + uint32_t s = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + + if (s) + { + uint32_t d = *dst; + + __m128i ms = unpack_32_1x128 (s); + __m128i alpha = expand_alpha_1x128 (ms); + __m128i mask = xmm_mask; + __m128i dest = unpack_32_1x128 (d); + + *dst = pack_1x128_32 ( + in_over_1x128 (&ms, &alpha, &mask, &dest)); + } + + dst++; + w--; + } + +} + +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER, + scaled_nearest_scanline_sse2_8888_n_8888_OVER, + uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE) +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER, + scaled_nearest_scanline_sse2_8888_n_8888_OVER, + uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE) +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER, + scaled_nearest_scanline_sse2_8888_n_8888_OVER, + uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE) + +#define BILINEAR_DECLARE_VARIABLES \ + const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \ + const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \ + const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);\ + const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1); \ + const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x, \ + unit_x, unit_x, unit_x, unit_x); \ + const __m128i xmm_zero = _mm_setzero_si128 (); \ + __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx, vx, vx, vx, vx) + +#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \ +do { \ + __m128i xmm_wh, xmm_lo, xmm_hi, a; \ + /* fetch 2x2 pixel block into sse2 register */ \ + uint32_t tl = src_top [pixman_fixed_to_int (vx)]; \ + uint32_t tr = src_top [pixman_fixed_to_int (vx) + 1]; \ + uint32_t bl = src_bottom [pixman_fixed_to_int (vx)]; \ + uint32_t br = src_bottom [pixman_fixed_to_int (vx) + 1]; \ + a = _mm_set_epi32 (tr, tl, br, bl); \ + vx += unit_x; \ + /* vertical interpolation */ \ + a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero), \ + xmm_wt), \ + _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero), \ + xmm_wb)); \ + /* calculate horizontal weights */ \ + xmm_wh = _mm_add_epi16 (xmm_addc, \ + _mm_xor_si128 (xmm_xorc, \ + _mm_srli_epi16 (xmm_x, 8))); \ + xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ + /* horizontal interpolation */ \ + xmm_lo = _mm_mullo_epi16 (a, xmm_wh); \ + xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \ + a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \ + _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); \ + /* shift and pack the result */ \ + a = _mm_srli_epi32 (a, 16); \ + a = _mm_packs_epi32 (a, a); \ + a = _mm_packus_epi16 (a, a); \ + pix = _mm_cvtsi128_si32 (a); \ +} while (0) + +#define BILINEAR_SKIP_ONE_PIXEL() \ +do { \ + vx += unit_x; \ + xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ +} while(0) + +static force_inline void +scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst, + const uint32_t * mask, + const uint32_t * src_top, + const uint32_t * src_bottom, + int32_t w, + int wt, + int wb, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + pixman_fixed_t max_vx, + pixman_bool_t zero_src) +{ + BILINEAR_DECLARE_VARIABLES; + uint32_t pix1, pix2, pix3, pix4; + + while ((w -= 4) >= 0) + { + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); + BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); + BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); + *dst++ = pix1; + *dst++ = pix2; + *dst++ = pix3; + *dst++ = pix4; + } + + if (w & 2) + { + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); + *dst++ = pix1; + *dst++ = pix2; + } + + if (w & 1) + { + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + *dst = pix1; + } + +} + +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC, + scaled_bilinear_scanline_sse2_8888_8888_SRC, + uint32_t, uint32_t, uint32_t, + COVER, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC, + scaled_bilinear_scanline_sse2_8888_8888_SRC, + uint32_t, uint32_t, uint32_t, + PAD, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC, + scaled_bilinear_scanline_sse2_8888_8888_SRC, + uint32_t, uint32_t, uint32_t, + NONE, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC, + scaled_bilinear_scanline_sse2_8888_8888_SRC, + uint32_t, uint32_t, uint32_t, + NORMAL, FLAG_NONE) + +static force_inline void +scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst, + const uint32_t * mask, + const uint32_t * src_top, + const uint32_t * src_bottom, + int32_t w, + int wt, + int wb, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + pixman_fixed_t max_vx, + pixman_bool_t zero_src) +{ + BILINEAR_DECLARE_VARIABLES; + uint32_t pix1, pix2, pix3, pix4; + + while (w && ((unsigned long)dst & 15)) + { + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + + if (pix1) + { + pix2 = *dst; + *dst = core_combine_over_u_pixel_sse2 (pix1, pix2); + } + + w--; + dst++; + } + + while (w >= 4) + { + __m128i xmm_src; + __m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo; + __m128i xmm_alpha_hi, xmm_alpha_lo; + + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); + BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); + BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); + + xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1); + + if (!is_zero (xmm_src)) + { + if (is_opaque (xmm_src)) + { + save_128_aligned ((__m128i *)dst, xmm_src); + } + else + { + __m128i xmm_dst = load_128_aligned ((__m128i *)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi); + over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + } + + w -= 4; + dst += 4; + } + + while (w) + { + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + + if (pix1) + { + pix2 = *dst; + *dst = core_combine_over_u_pixel_sse2 (pix1, pix2); + } + + w--; + dst++; + } +} + +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER, + scaled_bilinear_scanline_sse2_8888_8888_OVER, + uint32_t, uint32_t, uint32_t, + COVER, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER, + scaled_bilinear_scanline_sse2_8888_8888_OVER, + uint32_t, uint32_t, uint32_t, + PAD, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER, + scaled_bilinear_scanline_sse2_8888_8888_OVER, + uint32_t, uint32_t, uint32_t, + NONE, FLAG_NONE) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER, + scaled_bilinear_scanline_sse2_8888_8888_OVER, + uint32_t, uint32_t, uint32_t, + NORMAL, FLAG_NONE) + +static force_inline void +scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst, + const uint8_t * mask, + const uint32_t * src_top, + const uint32_t * src_bottom, + int32_t w, + int wt, + int wb, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + pixman_fixed_t max_vx, + pixman_bool_t zero_src) +{ + BILINEAR_DECLARE_VARIABLES; + uint32_t pix1, pix2, pix3, pix4; + uint32_t m; + + while (w && ((unsigned long)dst & 15)) + { + uint32_t sa; + + m = (uint32_t) *mask++; + + if (m) + { + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + sa = pix1 >> 24; + + if (sa == 0xff && m == 0xff) + { + *dst = pix1; + } + else + { + __m128i ms, md, ma, msa; + + pix2 = *dst; + ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); + ms = unpack_32_1x128 (pix1); + md = unpack_32_1x128 (pix2); + + msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); + + *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); + } + } + else + { + BILINEAR_SKIP_ONE_PIXEL (); + } + + w--; + dst++; + } + + while (w >= 4) + { + __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + m = *(uint32_t*)mask; + + if (m) + { + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); + BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); + BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); + + xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1); + + if (m == 0xffffffff && is_opaque (xmm_src)) + { + save_128_aligned ((__m128i *)dst, xmm_src); + } + else + { + xmm_dst = load_128_aligned ((__m128i *)dst); + + xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, + &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + } + else + { + BILINEAR_SKIP_ONE_PIXEL (); + BILINEAR_SKIP_ONE_PIXEL (); + BILINEAR_SKIP_ONE_PIXEL (); + BILINEAR_SKIP_ONE_PIXEL (); + } + + w -= 4; + dst += 4; + mask += 4; + } + + while (w) + { + uint32_t sa; + + m = (uint32_t) *mask++; + + if (m) + { + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); + sa = pix1 >> 24; + + if (sa == 0xff && m == 0xff) + { + *dst = pix1; + } + else + { + __m128i ms, md, ma, msa; + + pix2 = *dst; + ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); + ms = unpack_32_1x128 (pix1); + md = unpack_32_1x128 (pix2); + + msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); + + *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); + } + } + else + { + BILINEAR_SKIP_ONE_PIXEL (); + } + + w--; + dst++; + } +} + +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER, + scaled_bilinear_scanline_sse2_8888_8_8888_OVER, + uint32_t, uint8_t, uint32_t, + COVER, FLAG_HAVE_NON_SOLID_MASK) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER, + scaled_bilinear_scanline_sse2_8888_8_8888_OVER, + uint32_t, uint8_t, uint32_t, + PAD, FLAG_HAVE_NON_SOLID_MASK) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER, + scaled_bilinear_scanline_sse2_8888_8_8888_OVER, + uint32_t, uint8_t, uint32_t, + NONE, FLAG_HAVE_NON_SOLID_MASK) +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER, + scaled_bilinear_scanline_sse2_8888_8_8888_OVER, + uint32_t, uint8_t, uint32_t, + NORMAL, FLAG_HAVE_NON_SOLID_MASK) + +static const pixman_fast_path_t sse2_fast_paths[] = +{ + /* PIXMAN_OP_OVER */ + PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565), + PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca), + PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888), + PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888), + PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888), + PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888), + PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565), + PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), + + /* PIXMAN_OP_OVER_REVERSE */ + PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888), + PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888), + + /* PIXMAN_OP_ADD */ + PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca), + PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8), + PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888), + PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888), + PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8), + PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8), + + /* PIXMAN_OP_SRC */ + PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area), + PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area), + + /* PIXMAN_OP_IN */ + PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8), + PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8), + PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8), + + SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), + + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888), + + SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888), + SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888), + SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888), + + SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), + SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), + SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), + SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), + + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888), + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888), + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888), + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888), + + { PIXMAN_OP_NONE }, +}; + +static pixman_bool_t +sse2_blt (pixman_implementation_t *imp, + uint32_t * src_bits, + uint32_t * dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dest_x, + int dest_y, + int width, + int height) +{ + if (!pixman_blt_sse2 ( + src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, + src_x, src_y, dest_x, dest_y, width, height)) + + { + return _pixman_implementation_blt ( + imp->delegate, + src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, + src_x, src_y, dest_x, dest_y, width, height); + } + + return TRUE; +} + +#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) +__attribute__((__force_align_arg_pointer__)) +#endif +static pixman_bool_t +sse2_fill (pixman_implementation_t *imp, + uint32_t * bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor)) + { + return _pixman_implementation_fill ( + imp->delegate, bits, stride, bpp, x, y, width, height, xor); + } + + return TRUE; +} + +static uint32_t * +sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask) +{ + int w = iter->width; + __m128i ff000000 = mask_ff000000; + uint32_t *dst = iter->buffer; + uint32_t *src = (uint32_t *)iter->bits; + + iter->bits += iter->stride; + + while (w && ((unsigned long)dst) & 0x0f) + { + *dst++ = (*src++) | 0xff000000; + w--; + } + + while (w >= 4) + { + save_128_aligned ( + (__m128i *)dst, _mm_or_si128 ( + load_128_unaligned ((__m128i *)src), ff000000)); + + dst += 4; + src += 4; + w -= 4; + } + + while (w) + { + *dst++ = (*src++) | 0xff000000; + w--; + } + + return iter->buffer; +} + +static uint32_t * +sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask) +{ + int w = iter->width; + uint32_t *dst = iter->buffer; + uint16_t *src = (uint16_t *)iter->bits; + __m128i ff000000 = mask_ff000000; + + iter->bits += iter->stride; + + while (w && ((unsigned long)dst) & 0x0f) + { + uint16_t s = *src++; + + *dst++ = CONVERT_0565_TO_8888 (s); + w--; + } + + while (w >= 8) + { + __m128i lo, hi, s; + + s = _mm_loadu_si128 ((__m128i *)src); + + lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ())); + hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ())); + + save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000)); + save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000)); + + dst += 8; + src += 8; + w -= 8; + } + + while (w) + { + uint16_t s = *src++; + + *dst++ = CONVERT_0565_TO_8888 (s); + w--; + } + + return iter->buffer; +} + +static uint32_t * +sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask) +{ + int w = iter->width; + uint32_t *dst = iter->buffer; + uint8_t *src = iter->bits; + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6; + + iter->bits += iter->stride; + + while (w && (((unsigned long)dst) & 15)) + { + *dst++ = *(src++) << 24; + w--; + } + + while (w >= 16) + { + xmm0 = _mm_loadu_si128((__m128i *)src); + + xmm1 = _mm_unpacklo_epi8 (_mm_setzero_si128(), xmm0); + xmm2 = _mm_unpackhi_epi8 (_mm_setzero_si128(), xmm0); + xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1); + xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1); + xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2); + xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2); + + _mm_store_si128(((__m128i *)(dst + 0)), xmm3); + _mm_store_si128(((__m128i *)(dst + 4)), xmm4); + _mm_store_si128(((__m128i *)(dst + 8)), xmm5); + _mm_store_si128(((__m128i *)(dst + 12)), xmm6); + + dst += 16; + src += 16; + w -= 16; + } + + while (w) + { + *dst++ = *(src++) << 24; + w--; + } + + return iter->buffer; +} + +typedef struct +{ + pixman_format_code_t format; + pixman_iter_get_scanline_t get_scanline; +} fetcher_info_t; + +static const fetcher_info_t fetchers[] = +{ + { PIXMAN_x8r8g8b8, sse2_fetch_x8r8g8b8 }, + { PIXMAN_r5g6b5, sse2_fetch_r5g6b5 }, + { PIXMAN_a8, sse2_fetch_a8 }, + { PIXMAN_null } +}; + +static void +sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter) +{ + pixman_image_t *image = iter->image; + int x = iter->x; + int y = iter->y; + int width = iter->width; + int height = iter->height; + +#define FLAGS \ + (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | FAST_PATH_BITS_IMAGE) + + if ((iter->flags & ITER_NARROW) && + (image->common.flags & FLAGS) == FLAGS && + x >= 0 && y >= 0 && + x + width <= image->bits.width && + y + height <= image->bits.height) + { + const fetcher_info_t *f; + + for (f = &fetchers[0]; f->format != PIXMAN_null; f++) + { + if (image->common.extended_format_code == f->format) + { + uint8_t *b = (uint8_t *)image->bits.bits; + int s = image->bits.rowstride * 4; + + iter->bits = b + s * iter->y + x * PIXMAN_FORMAT_BPP (f->format) / 8; + iter->stride = s; + + iter->get_scanline = f->get_scanline; + return; + } + } + } + + imp->delegate->src_iter_init (imp->delegate, iter); +} + +#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) +__attribute__((__force_align_arg_pointer__)) +#endif +pixman_implementation_t * +_pixman_implementation_create_sse2 (pixman_implementation_t *fallback) +{ + pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths); + + /* SSE2 constants */ + mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000); + mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000); + mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0); + mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f); + mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000); + mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00); + mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8); + mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0); + mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000); + mask_0080 = create_mask_16_128 (0x0080); + mask_00ff = create_mask_16_128 (0x00ff); + mask_0101 = create_mask_16_128 (0x0101); + mask_ffff = create_mask_16_128 (0xffff); + mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000); + mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000); + + /* Set up function pointers */ + imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u; + imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u; + imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u; + imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u; + imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u; + imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u; + imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u; + imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u; + imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u; + imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u; + + imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u; + + imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca; + imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca; + imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca; + imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca; + imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca; + imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca; + imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca; + + imp->blt = sse2_blt; + imp->fill = sse2_fill; + + imp->src_iter_init = sse2_src_iter_init; + + return imp; +}