From f88ae14c15040345a12ff0488c7b23d25639e49b Mon Sep 17 00:00:00 2001
From: =?utf8?q?S=C3=B8ren=20Sandmann=20Pedersen?=
Date: Fri, 18 Feb 2011 03:56:20 -0500
Subject: [PATCH] sse2: Convert all uses of MMX registers to use SSE2
 registers instead.

By avoiding use of MMX registers we won't need to call emms all over
the place, which avoids various miscompilation issues.
---
 pixman/pixman-sse2.c | 788 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 440 insertions(+), 348 deletions(-)

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index c4ff3c1..283e4c4 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -407,6 +407,12 @@ load_32_1x64 (uint32_t data)
     return _mm_cvtsi32_si64 (data);
 }
 
+static force_inline __m128i
+load_32_1x128 (uint32_t data)
+{
+    return _mm_cvtsi32_si128 (data);
+}
+
 static force_inline __m64
 unpack_32_1x64 (uint32_t data)
 {
@@ -425,6 +431,12 @@ expand_alpha_rev_1x64 (__m64 data)
     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
 }
 
+static force_inline __m128i
+expand_alpha_rev_1x128 (__m128i data)
+{
+    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
+}
+
 static force_inline __m64
 expand_pixel_8_1x64 (uint8_t data)
 {
@@ -432,6 +444,13 @@ expand_pixel_8_1x64 (uint8_t data)
         unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
 }
 
+static force_inline __m128i
+expand_pixel_8_1x128 (uint8_t data)
+{
+    return _mm_shufflelo_epi16 (
+        unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
+}
+
 static force_inline __m64
 pix_multiply_1x64 (__m64 data,
@@ -441,6 +460,15 @@ pix_multiply_1x64 (__m64 data,
                                           mask_x0101);
 }
 
+static force_inline __m128i
+pix_multiply_1x128 (__m128i data,
+                    __m128i alpha)
+{
+    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
+                                            mask_0080),
+                            mask_0101);
+}
+
 static force_inline __m64
 pix_add_multiply_1x64 (__m64* src,
                        __m64* alpha_dst,
@@ -453,24 +481,54 @@ pix_add_multiply_1x64 (__m64* src,
     return _mm_adds_pu8 (t1, t2);
 }
 
+static force_inline __m128i
+pix_add_multiply_1x128 (__m128i* src,
+                        __m128i* alpha_dst,
+                        __m128i* dst,
+                        __m128i* alpha_src)
+{
+    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
+    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
+
+    return _mm_adds_epu8 (t1, t2);
+}
+
 static force_inline __m64
 negate_1x64 (__m64 data)
 {
     return _mm_xor_si64 (data, mask_x00ff);
 }
 
+static force_inline __m128i
+negate_1x128 (__m128i data)
+{
+    return _mm_xor_si128 (data, mask_00ff);
+}
+
 static force_inline __m64
 invert_colors_1x64 (__m64 data)
 {
     return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
 }
 
+static force_inline __m128i
+invert_colors_1x128 (__m128i data)
+{
+    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
+}
+
 static force_inline __m64
 over_1x64 (__m64 src, __m64 alpha, __m64 dst)
 {
     return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
 }
 
+static force_inline __m128i
+over_1x128 (__m128i src, __m128i alpha, __m128i dst)
+{
+    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
+}
+
 static force_inline __m64
 in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
 {
@@ -479,6 +537,14 @@ in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
                       *dst);
 }
 
+static force_inline __m128i
+in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
+{
+    return over_1x128 (pix_multiply_1x128 (*src, *mask),
+                       pix_multiply_1x128 (*alpha, *mask),
+                       *dst);
+}
+
 static force_inline __m64
 over_rev_non_pre_1x64 (__m64 src, __m64 dst)
 {
@@ -490,12 +556,29 @@ over_rev_non_pre_1x64 (__m64 src, __m64 dst)
                       dst);
 }
 
+static force_inline __m128i
+over_rev_non_pre_1x128 (__m128i src, __m128i dst)
+{
+    __m128i alpha = expand_alpha_1x128 (src);
+
+    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
+                                           _mm_or_si128 (alpha, mask_alpha)),
+                       alpha,
+                       dst);
+}
+
 static force_inline uint32_t
 pack_1x64_32 (__m64 data)
 {
     return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
 }
 
+static force_inline uint32_t
+pack_1x128_32 (__m128i data)
+{
+    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
+}
+
 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
  *
  *    00RR00GG00BB
@@ -529,6 +612,16 @@ expand565_16_1x64 (uint16_t pixel)
     return _mm_srli_pi16 (p, 8);
 }
 
+static force_inline __m128i
+expand565_16_1x128 (uint16_t pixel)
+{
+    __m128i m = _mm_cvtsi32_si128 (pixel);
+
+    m = unpack_565_to_8888 (m);
+
+    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
+}
+
 /* ----------------------------------------------------------------------------
  * Compose Core transformations
  */
@@ -536,7 +629,7 @@ static force_inline uint32_t
 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
 {
     uint8_t a;
-    __m64 ms;
+    __m128i xmms;
 
     a = src >> 24;
 
@@ -546,9 +639,10 @@ core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
     }
     else if (src)
    {
-	ms = unpack_32_1x64 (src);
-	return pack_1x64_32 (
-	    over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
+	xmms = unpack_32_1x128 (src);
+	return pack_1x128_32 (
+	    over_1x128 (xmms, expand_alpha_1x128 (xmms),
+			unpack_32_1x128 (dst)));
     }
 
     return dst;
@@ -561,15 +655,15 @@ combine1 (const uint32_t *ps, const uint32_t *pm)
 
     if (pm)
     {
-	__m64 ms, mm;
+	__m128i ms, mm;
 
-	mm = unpack_32_1x64 (*pm);
-	mm = expand_alpha_1x64 (mm);
+	mm = unpack_32_1x128 (*pm);
+	mm = expand_alpha_1x128 (mm);
 
-	ms = unpack_32_1x64 (s);
-	ms = pix_multiply_1x64 (ms, mm);
+	ms = unpack_32_1x128 (s);
+	ms = pix_multiply_1x128 (ms, mm);
 
-	s = pack_1x64_32 (ms);
+	s = pack_1x128_32 (ms);
     }
 
     return s;
@@ -857,9 +951,9 @@ core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
     }
     else if (maska != 0xff)
     {
-	return pack_1x64_32 (
-	    pix_multiply_1x64 (unpack_32_1x64 (dst),
-			       expand_alpha_1x64 (unpack_32_1x64 (src))));
+	return pack_1x128_32 (
+	    pix_multiply_1x128 (unpack_32_1x128 (dst),
+				expand_alpha_1x128 (unpack_32_1x128 (src))));
     }
 
     return dst;
@@ -994,10 +1088,10 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd,
	uint32_t s = combine1 (ps, pm);
	uint32_t d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		unpack_32_1x64 (d), negate_1x64 (
-		    expand_alpha_1x64 (unpack_32_1x64 (s)))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (d), negate_1x128 (
+		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
 
	if (pm)
	    pm++;
@@ -1039,10 +1133,10 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd,
	uint32_t s = combine1 (ps, pm);
	uint32_t d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		unpack_32_1x64 (d), negate_1x64 (
-		    expand_alpha_1x64 (unpack_32_1x64 (s)))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (d), negate_1x128 (
+		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
	ps++;
	if (pm)
	    pm++;
@@ -1061,10 +1155,10 @@ core_combine_out_u_sse2 (uint32_t* pd,
	uint32_t s = combine1 (ps, pm);
	uint32_t d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		unpack_32_1x64 (s), negate_1x64 (
-		    expand_alpha_1x64 (unpack_32_1x64 (d)))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (s), negate_1x128 (
+		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
	w--;
	ps++;
	if (pm)
@@ -1104,10 +1198,10 @@ core_combine_out_u_sse2 (uint32_t* pd,
	uint32_t s = combine1 (ps, pm);
	uint32_t d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		unpack_32_1x64 (s), negate_1x64 (
-		    expand_alpha_1x64 (unpack_32_1x64 (d)))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (s), negate_1x128 (
+		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
	w--;
	ps++;
	if (pm)
@@ -1119,13 +1213,13 @@ static force_inline uint32_t
 core_combine_atop_u_pixel_sse2 (uint32_t src,
                                 uint32_t dst)
 {
-    __m64 s = unpack_32_1x64 (src);
-    __m64 d = unpack_32_1x64 (dst);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
 
-    __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
-    __m64 da = expand_alpha_1x64 (d);
+    __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
+    __m128i da = expand_alpha_1x128 (d);
 
-    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
+    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
 }
 
 static force_inline void
@@ -1201,13 +1295,13 @@ static force_inline uint32_t
 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
                                         uint32_t dst)
 {
-    __m64 s = unpack_32_1x64 (src);
-    __m64 d = unpack_32_1x64 (dst);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
 
-    __m64 sa = expand_alpha_1x64 (s);
-    __m64 da = negate_1x64 (expand_alpha_1x64 (d));
+    __m128i sa = expand_alpha_1x128 (s);
+    __m128i da = negate_1x128 (expand_alpha_1x128 (d));
 
-    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
+    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
 }
 
 static force_inline void
@@ -1283,13 +1377,13 @@ static force_inline uint32_t
 core_combine_xor_u_pixel_sse2 (uint32_t src,
                                uint32_t dst)
 {
-    __m64 s = unpack_32_1x64 (src);
-    __m64 d = unpack_32_1x64 (dst);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
 
-    __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
-    __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));
+    __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
+    __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
 
-    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
+    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
 }
 
 static force_inline void
@@ -1387,8 +1481,8 @@ core_combine_add_u_sse2 (uint32_t* dst,
	ps++;
	if (pm)
	    pm++;
-	*pd++ = _mm_cvtsi64_si32 (
-	    _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
+	*pd++ = _mm_cvtsi128_si32 (
+	    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
	w--;
 }
 
@@ -1414,8 +1508,8 @@ core_combine_add_u_sse2 (uint32_t* dst,
	d = *pd;
	ps++;
-	*pd++ = _mm_cvtsi64_si32 (
-	    _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
+	*pd++ = _mm_cvtsi128_si32 (
+	    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
	if (pm)
	    pm++;
 }
@@ -1425,18 +1519,18 @@ static force_inline uint32_t
 core_combine_saturate_u_pixel_sse2 (uint32_t src,
                                     uint32_t dst)
 {
-    __m64 ms = unpack_32_1x64 (src);
-    __m64 md = unpack_32_1x64 (dst);
+    __m128i ms = unpack_32_1x128 (src);
+    __m128i md = unpack_32_1x128 (dst);
     uint32_t sa = src >> 24;
     uint32_t da = ~dst >> 24;
 
     if (sa > da)
     {
-	ms = pix_multiply_1x64 (
-	    ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
+	ms = pix_multiply_1x128 (
+	    ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
     }
 
-    return pack_1x64_32 (_mm_adds_pu16 (md, ms));
+    return pack_1x128_32 (_mm_adds_epu16 (md, ms));
 }
 
 static force_inline void
@@ -1540,8 +1634,8 @@ core_combine_src_ca_sse2 (uint32_t* pd,
     {
	s = *ps++;
	m = *pm++;
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
	w--;
     }
 
@@ -1570,8 +1664,8 @@ core_combine_src_ca_sse2 (uint32_t* pd,
     {
	s = *ps++;
	m = *pm++;
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
	w--;
     }
 }
@@ -1581,12 +1675,12 @@ core_combine_over_ca_pixel_sse2 (uint32_t src,
                                  uint32_t mask,
                                  uint32_t dst)
 {
-    __m64 s = unpack_32_1x64 (src);
-    __m64 expAlpha = expand_alpha_1x64 (s);
-    __m64 unpk_mask = unpack_32_1x64 (mask);
-    __m64 unpk_dst = unpack_32_1x64 (dst);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i expAlpha = expand_alpha_1x128 (s);
+    __m128i unpk_mask = unpack_32_1x128 (mask);
+    __m128i unpk_dst = unpack_32_1x128 (dst);
 
-    return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
+    return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
 }
 
 static force_inline void
@@ -1655,12 +1749,12 @@ core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
                                          uint32_t mask,
                                          uint32_t dst)
 {
-    __m64 d = unpack_32_1x64 (dst);
+    __m128i d = unpack_32_1x128 (dst);
 
-    return pack_1x64_32 (
-	over_1x64 (d, expand_alpha_1x64 (d),
-		   pix_multiply_1x64 (unpack_32_1x64 (src),
-				      unpack_32_1x64 (mask))));
+    return pack_1x128_32 (
+	over_1x128 (d, expand_alpha_1x128 (d),
+		    pix_multiply_1x128 (unpack_32_1x128 (src),
+					unpack_32_1x128 (mask))));
 }
 
 static force_inline void
@@ -1745,10 +1839,10 @@ core_combine_in_ca_sse2 (uint32_t * pd,
	m = *pm++;
	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
-		expand_alpha_1x64 (unpack_32_1x64 (d))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
+		expand_alpha_1x128 (unpack_32_1x128 (d))));
 
	w--;
     }
@@ -1789,11 +1883,11 @@ core_combine_in_ca_sse2 (uint32_t * pd,
	m = *pm++;
	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		pix_multiply_1x64 (
-		    unpack_32_1x64 (s), unpack_32_1x64 (m)),
-		expand_alpha_1x64 (unpack_32_1x64 (d))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		pix_multiply_1x128 (
+		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
+		expand_alpha_1x128 (unpack_32_1x128 (d))));
 
	w--;
     }
@@ -1818,11 +1912,11 @@ core_combine_in_reverse_ca_sse2 (uint32_t * pd,
	m = *pm++;
	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		unpack_32_1x64 (d),
-		pix_multiply_1x64 (unpack_32_1x64 (m),
-				   expand_alpha_1x64 (unpack_32_1x64 (s)))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (d),
+		pix_multiply_1x128 (unpack_32_1x128 (m),
+				    expand_alpha_1x128 (unpack_32_1x128 (s)))));
 
	w--;
     }
@@ -1861,11 +1955,11 @@ core_combine_in_reverse_ca_sse2 (uint32_t * pd,
	m = *pm++;
	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		unpack_32_1x64 (d),
-		pix_multiply_1x64 (unpack_32_1x64 (m),
-				   expand_alpha_1x64 (unpack_32_1x64 (s)))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (d),
+		pix_multiply_1x128 (unpack_32_1x128 (m),
+				    expand_alpha_1x128 (unpack_32_1x128 (s)))));
	w--;
     }
 }
@@ -1889,11 +1983,11 @@ core_combine_out_ca_sse2 (uint32_t * pd,
	m = *pm++;
	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		pix_multiply_1x64 (
-		    unpack_32_1x64 (s), unpack_32_1x64 (m)),
-		negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		pix_multiply_1x128 (
+		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
+		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
 
	w--;
     }
@@ -1934,11 +2028,11 @@ core_combine_out_ca_sse2 (uint32_t * pd,
	m = *pm++;
	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		pix_multiply_1x64 (
-		    unpack_32_1x64 (s), unpack_32_1x64 (m)),
-		negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		pix_multiply_1x128 (
+		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
+		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
 
	w--;
     }
@@ -1963,12 +2057,12 @@ core_combine_out_reverse_ca_sse2 (uint32_t * pd,
	m = *pm++;
	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		unpack_32_1x64 (d),
-		negate_1x64 (pix_multiply_1x64 (
-				 unpack_32_1x64 (m),
-				 expand_alpha_1x64 (unpack_32_1x64 (s))))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (d),
+		negate_1x128 (pix_multiply_1x128 (
+				  unpack_32_1x128 (m),
+				  expand_alpha_1x128 (unpack_32_1x128 (s))))));
 
	w--;
     }
@@ -2011,12 +2105,12 @@ core_combine_out_reverse_ca_sse2 (uint32_t * pd,
	m = *pm++;
	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    pix_multiply_1x64 (
-		unpack_32_1x64 (d),
-		negate_1x64 (pix_multiply_1x64 (
-				 unpack_32_1x64 (m),
-				 expand_alpha_1x64 (unpack_32_1x64 (s))))));
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (d),
+		negate_1x128 (pix_multiply_1x128 (
+				  unpack_32_1x128 (m),
+				  expand_alpha_1x128 (unpack_32_1x128 (s))))));
	w--;
     }
 }
@@ -2026,16 +2120,16 @@ core_combine_atop_ca_pixel_sse2 (uint32_t src,
                                  uint32_t mask,
                                  uint32_t dst)
 {
-    __m64 m = unpack_32_1x64 (mask);
-    __m64 s = unpack_32_1x64 (src);
-    __m64 d = unpack_32_1x64 (dst);
-    __m64 sa = expand_alpha_1x64 (s);
-    __m64 da = expand_alpha_1x64 (d);
+    __m128i m = unpack_32_1x128 (mask);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
+    __m128i sa = expand_alpha_1x128 (s);
+    __m128i da = expand_alpha_1x128 (d);
 
-    s = pix_multiply_1x64 (s, m);
-    m = negate_1x64 (pix_multiply_1x64 (m, sa));
+    s = pix_multiply_1x128 (s, m);
+    m = negate_1x128 (pix_multiply_1x128 (m, sa));
 
-    return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
+    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
 }
 
 static force_inline void
@@ -2116,17 +2210,17 @@ core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
                                          uint32_t mask,
                                          uint32_t dst)
 {
-    __m64 m = unpack_32_1x64 (mask);
-    __m64 s = unpack_32_1x64 (src);
-    __m64 d = unpack_32_1x64 (dst);
+    __m128i m = unpack_32_1x128 (mask);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
 
-    __m64 da = negate_1x64 (expand_alpha_1x64 (d));
-    __m64 sa = expand_alpha_1x64 (s);
+    __m128i da = negate_1x128 (expand_alpha_1x128 (d));
+    __m128i sa = expand_alpha_1x128 (s);
 
-    s = pix_multiply_1x64 (s, m);
-    m = pix_multiply_1x64 (m, sa);
+    s = pix_multiply_1x128 (s, m);
+    m = pix_multiply_1x128 (m, sa);
 
-    return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
+    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
 }
 
 static force_inline void
@@ -2208,16 +2302,16 @@ core_combine_xor_ca_pixel_sse2 (uint32_t src,
                                 uint32_t mask,
                                 uint32_t dst)
 {
-    __m64 a = unpack_32_1x64 (mask);
-    __m64 s = unpack_32_1x64 (src);
-    __m64 d = unpack_32_1x64 (dst);
+    __m128i a = unpack_32_1x128 (mask);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
 
-    __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
-				       a, expand_alpha_1x64 (s)));
-    __m64 dest = pix_multiply_1x64 (s, a);
-    __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
+    __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
+					  a, expand_alpha_1x128 (s)));
+    __m128i dest = pix_multiply_1x128 (s, a);
+    __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
 
-    return pack_1x64_32 (pix_add_multiply_1x64 (&d,
+    return pack_1x128_32 (pix_add_multiply_1x128 (&d,
                                                 &alpha_dst,
                                                 &dest,
                                                 &alpha_src));
@@ -2317,10 +2411,10 @@ core_combine_add_ca_sse2 (uint32_t * pd,
	m = *pm++;
	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
-					     unpack_32_1x64 (m)),
-			  unpack_32_1x64 (d)));
+	*pd++ = pack_1x128_32 (
+	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
+					       unpack_32_1x128 (m)),
+			   unpack_32_1x128 (d)));
	w--;
     }
 
@@ -2355,10 +2449,10 @@ core_combine_add_ca_sse2 (uint32_t * pd,
	m = *pm++;
	d = *pd;
 
-	*pd++ = pack_1x64_32 (
-	    _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
-					     unpack_32_1x64 (m)),
-			  unpack_32_1x64 (d)));
+	*pd++ = pack_1x128_32 (
+	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
+					       unpack_32_1x128 (m)),
+			   unpack_32_1x128 (d)));
	w--;
     }
 }
@@ -2711,9 +2805,9 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
	while (w && (unsigned long)dst & 15)
	{
	    d = *dst;
-	    *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
-					      _mm_movepi64_pi64 (xmm_alpha),
-					      unpack_32_1x64 (d)));
+	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
+						xmm_alpha,
+						unpack_32_1x128 (d)));
	    w--;
	}
 
@@ -2738,9 +2832,9 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
	while (w)
	{
	    d = *dst;
-	    *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
-					      _mm_movepi64_pi64 (xmm_alpha),
-					      unpack_32_1x64 (d)));
+	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
+						xmm_alpha,
+						unpack_32_1x128 (d)));
	    w--;
	}
 
@@ -2796,9 +2890,9 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
	    d = *dst;
 
	    *dst++ = pack_565_32_16 (
-		pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
-					 _mm_movepi64_pi64 (xmm_alpha),
-					 expand565_16_1x64 (d))));
+		pack_1x128_32 (over_1x128 (xmm_src,
+					   xmm_alpha,
+					   expand565_16_1x128 (d))));
	    w--;
	}
 
@@ -2829,9 +2923,8 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
	{
	    d = *dst;
	    *dst++ = pack_565_32_16 (
-		pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
-					 _mm_movepi64_pi64 (xmm_alpha),
-					 expand565_16_1x64 (d))));
+		pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
+					   expand565_16_1x128 (d))));
	}
     }
 
@@ -2866,7 +2959,7 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
     __m128i xmm_dst;
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
 
-    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
 
     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
     srca = src >> 24;
@@ -2882,8 +2975,8 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
     xmm_src = _mm_unpacklo_epi8 (
	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
     xmm_alpha = expand_alpha_1x128 (xmm_src);
-    mmx_src   = _mm_movepi64_pi64 (xmm_src);
-    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+    mmx_src   = xmm_src;
+    mmx_alpha = xmm_alpha;
 
     while (height--)
     {
@@ -2902,11 +2995,11 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
	    {
		d = *pd;
 
-		mmx_mask = unpack_32_1x64 (m);
-		mmx_dest = unpack_32_1x64 (d);
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
 
-		*pd = pack_1x64_32 (
-		    _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
+		*pd = pack_1x128_32 (
+		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), mmx_dest));
	    }
 
	    pd++;
@@ -2950,11 +3043,11 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
	    {
		d = *pd;
 
-		mmx_mask = unpack_32_1x64 (m);
-		mmx_dest = unpack_32_1x64 (d);
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
 
-		*pd = pack_1x64_32 (
-		    _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
+		*pd = pack_1x128_32 (
+		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), mmx_dest));
	    }
 
	    pd++;
@@ -2994,7 +3087,7 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
 
-    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
 
     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
 
@@ -3009,8 +3102,8 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
     xmm_src = _mm_unpacklo_epi8 (
	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
     xmm_alpha = expand_alpha_1x128 (xmm_src);
-    mmx_src   = _mm_movepi64_pi64 (xmm_src);
-    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+    mmx_src   = xmm_src;
+    mmx_alpha = xmm_alpha;
 
     while (height--)
     {
@@ -3028,10 +3121,10 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
	    if (m)
	    {
		d = *pd;
-		mmx_mask = unpack_32_1x64 (m);
-		mmx_dest = unpack_32_1x64 (d);
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
 
-		*pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
+		*pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
                                                   &mmx_alpha,
                                                   &mmx_mask,
                                                   &mmx_dest));
@@ -3078,11 +3171,11 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
	    if (m)
	    {
		d = *pd;
-		mmx_mask = unpack_32_1x64 (m);
-		mmx_dest = unpack_32_1x64 (d);
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
 
-		*pd = pack_1x64_32 (
-		    in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
+		*pd = pack_1x128_32 (
+		    in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
	    }
 
	    pd++;
@@ -3148,13 +3241,13 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
	    {
		uint32_t d = *dst;
 
-		__m64 ms = unpack_32_1x64 (s);
-		__m64 alpha    = expand_alpha_1x64 (ms);
-		__m64 dest     = _mm_movepi64_pi64 (xmm_mask);
-		__m64 alpha_dst = unpack_32_1x64 (d);
+		__m128i ms = unpack_32_1x128 (s);
+		__m128i alpha     = expand_alpha_1x128 (ms);
+		__m128i dest      = xmm_mask;
+		__m128i alpha_dst = unpack_32_1x128 (d);
 
-		*dst = pack_1x64_32 (
-		    in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));
+		*dst = pack_1x128_32 (
+		    in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
	    }
	    dst++;
	    w--;
@@ -3195,13 +3288,13 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
	    {
		uint32_t d = *dst;
 
-		__m64 ms = unpack_32_1x64 (s);
-		__m64 alpha = expand_alpha_1x64 (ms);
-		__m64 mask  = _mm_movepi64_pi64 (xmm_mask);
-		__m64 dest  = unpack_32_1x64 (d);
+		__m128i ms = unpack_32_1x128 (s);
+		__m128i alpha = expand_alpha_1x128 (ms);
+		__m128i mask  = xmm_mask;
+		__m128i dest  = unpack_32_1x128 (d);
 
-		*dst = pack_1x64_32 (
-		    in_over_1x64 (&ms, &alpha, &mask, &dest));
+		*dst = pack_1x128_32 (
+		    in_over_1x128 (&ms, &alpha, &mask, &dest));
	    }
 
	    dst++;
@@ -3336,13 +3429,13 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
	    uint32_t s = (*src++) | 0xff000000;
	    uint32_t d = *dst;
 
-	    __m64 src   = unpack_32_1x64 (s);
-	    __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
-	    __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
-	    __m64 dest  = unpack_32_1x64 (d);
+	    __m128i src   = unpack_32_1x128 (s);
+	    __m128i alpha = xmm_alpha;
+	    __m128i mask  = xmm_mask;
+	    __m128i dest  = unpack_32_1x128 (d);
 
-	    *dst++ = pack_1x64_32 (
-		in_over_1x64 (&src, &alpha, &mask, &dest));
+	    *dst++ = pack_1x128_32 (
+		in_over_1x128 (&src, &alpha, &mask, &dest));
 
	    w--;
	}
@@ -3375,13 +3468,13 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
	    uint32_t s = (*src++) | 0xff000000;
	    uint32_t d = *dst;
 
-	    __m64 src   = unpack_32_1x64 (s);
-	    __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
-	    __m64 mask  = _mm_movepi64_pi64 (xmm_mask);
-	    __m64 dest  = unpack_32_1x64 (d);
+	    __m128i src   = unpack_32_1x128 (s);
+	    __m128i alpha = xmm_alpha;
+	    __m128i mask  = xmm_mask;
+	    __m128i dest  = unpack_32_1x128 (d);
 
-	    *dst++ = pack_1x64_32 (
-		in_over_1x64 (&src, &alpha, &mask, &dest));
+	    *dst++ = pack_1x128_32 (
+		in_over_1x128 (&src, &alpha, &mask, &dest));
 
	    w--;
	}
@@ -3436,13 +3529,13 @@ sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
 static force_inline uint16_t
 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
 {
-    __m64 ms;
+    __m128i ms;
 
-    ms = unpack_32_1x64 (src);
+    ms = unpack_32_1x128 (src);
     return pack_565_32_16 (
-	pack_1x64_32 (
-	    over_1x64 (
-		ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
+	pack_1x128_32 (
+	    over_1x128 (
+		ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
 }
 
 static void
@@ -3588,7 +3681,7 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
 
-    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
 
     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
 
@@ -3604,8 +3697,8 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
     xmm_def = create_mask_2x32_128 (src, src);
     xmm_src = expand_pixel_32_1x128 (src);
     xmm_alpha = expand_alpha_1x128 (xmm_src);
-    mmx_src   = _mm_movepi64_pi64 (xmm_src);
-    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+    mmx_src   = xmm_src;
+    mmx_alpha = xmm_alpha;
 
     while (height--)
     {
@@ -3622,10 +3715,10 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
	    if (m)
	    {
		d = *dst;
-		mmx_mask = expand_pixel_8_1x64 (m);
-		mmx_dest = unpack_32_1x64 (d);
+		mmx_mask = expand_pixel_8_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
 
-		*dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
+		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
                                                   &mmx_alpha,
                                                   &mmx_mask,
                                                   &mmx_dest));
@@ -3677,10 +3770,10 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
	    if (m)
	    {
		d = *dst;
-		mmx_mask = expand_pixel_8_1x64 (m);
-		mmx_dest = unpack_32_1x64 (d);
+		mmx_mask = expand_pixel_8_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
 
-		*dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
+		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
                                                   &mmx_alpha,
                                                   &mmx_mask,
                                                   &mmx_dest));
@@ -3907,9 +4000,8 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
 
	    if (m)
	    {
-		*dst = pack_1x64_32 (
-		    pix_multiply_1x64 (
-			_mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
+		*dst = pack_1x128_32 (
+		    pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
	    }
	    else
	    {
@@ -3962,9 +4054,9 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
 
	    if (m)
	    {
-		*dst = pack_1x64_32 (
-		    pix_multiply_1x64 (
-			_mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
+		*dst = pack_1x128_32 (
+		    pix_multiply_1x128 (
+			xmm_src, expand_pixel_8_1x128 (m)));
	    }
	    else
	    {
@@ -4004,7 +4096,7 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
     int dst_stride, mask_stride;
     int32_t w;
     uint32_t m;
-    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
 
     __m128i xmm_src, xmm_alpha;
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
@@ -4023,8 +4115,8 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
     xmm_src = expand_pixel_32_1x128 (src);
     xmm_alpha = expand_alpha_1x128 (xmm_src);
-    mmx_src = _mm_movepi64_pi64 (xmm_src);
-    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+    mmx_src = xmm_src;
+    mmx_alpha = xmm_alpha;
 
     while (height--)
     {
@@ -4041,12 +4133,12 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
	    if (m)
	    {
		d = *dst;
-		mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
-		mmx_dest = expand565_16_1x64 (d);
+		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+		mmx_dest = expand565_16_1x128 (d);
 
		*dst = pack_565_32_16 (
-		    pack_1x64_32 (
-			in_over_1x64 (
+		    pack_1x128_32 (
+			in_over_1x128 (
			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
	    }
 
@@ -4114,12 +4206,12 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
	    if (m)
	    {
		d = *dst;
-		mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
-		mmx_dest = expand565_16_1x64 (d);
+		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+		mmx_dest = expand565_16_1x128 (d);
 
		*dst = pack_565_32_16 (
-		    pack_1x64_32 (
-			in_over_1x64 (
+		    pack_1x128_32 (
+			in_over_1x128 (
			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
	    }
 
@@ -4156,7 +4248,7 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
     int32_t w;
     uint32_t opaque, zero;
 
-    __m64 ms;
+    __m128i ms;
 
     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
@@ -4187,11 +4279,11 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
	    s = *src++;
	    d = *dst;
 
-	    ms = unpack_32_1x64 (s);
+	    ms = unpack_32_1x128 (s);
 
	    *dst++ = pack_565_32_16 (
-		pack_1x64_32 (
-		    over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
+		pack_1x128_32 (
+		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
	    w--;
	}
 
@@ -4253,11 +4345,11 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
	    s = *src++;
	    d = *dst;
 
-	    ms = unpack_32_1x64 (s);
+	    ms = unpack_32_1x128 (s);
 
	    *dst++ = pack_565_32_16 (
-		pack_1x64_32 (
-		    over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
+		pack_1x128_32 (
+		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
	    w--;
	}
     }
@@ -4320,9 +4412,9 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
	    s = *src++;
	    d = *dst;
 
-	    *dst++ = pack_1x64_32 (
-		over_rev_non_pre_1x64 (
-		    unpack_32_1x64 (s), unpack_32_1x64 (d)));
+	    *dst++ = pack_1x128_32 (
+		over_rev_non_pre_1x128 (
+		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
 
	    w--;
	}
@@ -4367,9 +4459,9 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
	    s = *src++;
	    d = *dst;
 
-	    *dst++ = pack_1x64_32 (
-		over_rev_non_pre_1x64 (
-		    unpack_32_1x64 (s), unpack_32_1x64 (d)));
+	    *dst++ = pack_1x128_32 (
+		over_rev_non_pre_1x128 (
+		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
 
	    w--;
	}
@@ -4408,7 +4500,7 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
 
-    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
 
     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
 
@@ -4422,8 +4514,8 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
     xmm_src = expand_pixel_32_1x128 (src);
     xmm_alpha = expand_alpha_1x128 (xmm_src);
-    mmx_src = _mm_movepi64_pi64 (xmm_src);
-    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+    mmx_src = xmm_src;
+    mmx_alpha = xmm_alpha;
 
     while (height--)
     {
@@ -4440,12 +4532,12 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
	    if (m)
	    {
		d = *dst;
-		mmx_mask = unpack_32_1x64 (m);
-		mmx_dest = expand565_16_1x64 (d);
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = expand565_16_1x128 (d);
 
		*dst = pack_565_32_16 (
-		    pack_1x64_32 (
-			in_over_1x64 (
+		    pack_1x128_32 (
+			in_over_1x128 (
			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
	    }
 
@@ -4509,12 +4601,12 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
	    if (m)
	    {
		d = *dst;
-		mmx_mask = unpack_32_1x64 (m);
-		mmx_dest = expand565_16_1x64 (d);
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = expand565_16_1x128 (d);
 
		*dst = pack_565_32_16 (
-		    pack_1x64_32 (
-			in_over_1x64 (
+		    pack_1x128_32 (
+			in_over_1x128 (
			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
	    }
 
@@ -4582,11 +4674,11 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
	    m = (uint32_t) *mask++;
	    d = (uint32_t) *dst;
 
-	    *dst++ = (uint8_t) pack_1x64_32 (
-		pix_multiply_1x64 (
-		    pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
-				       unpack_32_1x64 (m)),
-		    unpack_32_1x64 (d)));
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (
+		    pix_multiply_1x128 (xmm_alpha,
+					unpack_32_1x128 (m)),
+		    unpack_32_1x128 (d)));
	    w--;
	}
 
@@ -4619,11 +4711,11 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
	    m = (uint32_t) *mask++;
	    d = (uint32_t) *dst;
 
-	    *dst++ = (uint8_t) pack_1x64_32 (
-		pix_multiply_1x64 (
-		    pix_multiply_1x64 (
-			_mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
-		    unpack_32_1x64 (d)));
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (
+		    pix_multiply_1x128 (
+			xmm_alpha, unpack_32_1x128 (m)),
+		    unpack_32_1x128 (d)));
	    w--;
	}
     }
@@ -4689,10 +4781,10 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp,
	{
	    d = (uint32_t) *dst;
 
-	    *dst++ = (uint8_t) pack_1x64_32 (
-		pix_multiply_1x64 (
-		    _mm_movepi64_pi64 (xmm_alpha),
-		    unpack_32_1x64 (d)));
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (
+		    xmm_alpha,
+		    unpack_32_1x128 (d)));
	    w--;
	}
 
@@ -4717,10 +4809,10 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp,
	{
	    d = (uint32_t) *dst;
 
-	    *dst++ = (uint8_t) pack_1x64_32 (
-		pix_multiply_1x64 (
-		    _mm_movepi64_pi64 (xmm_alpha),
-		    unpack_32_1x64 (d)));
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (
+		    xmm_alpha,
+		    unpack_32_1x128 (d)));
	    w--;
	}
     }
@@ -4774,9 +4866,9 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
	    s = (uint32_t) *src++;
	    d = (uint32_t) *dst;
 
-	    *dst++ = (uint8_t) pack_1x64_32 (
-		pix_multiply_1x64 (
-		    unpack_32_1x64 (s), unpack_32_1x64 (d)));
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (
+		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
	    w--;
	}
 
@@ -4805,8 +4897,8 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
	    s = (uint32_t) *src++;
	    d = (uint32_t) *dst;
 
-	    *dst++ = (uint8_t) pack_1x64_32 (
-		pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
	    w--;
	}
     }
@@ -4869,11 +4961,11 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
	    m = (uint32_t) *mask++;
	    d = (uint32_t) *dst;
 
-	    *dst++ = (uint8_t) pack_1x64_32 (
-		_mm_adds_pu16 (
-		    pix_multiply_1x64 (
-			_mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
-		    unpack_32_1x64 (d)));
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		_mm_adds_epu16 (
+		    pix_multiply_1x128 (
+			xmm_alpha, unpack_32_1x128 (m)),
+		    unpack_32_1x128 (d)));
	    w--;
	}
 
@@ -4905,11 +4997,11 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
	    m = (uint32_t) *mask++;
	    d = (uint32_t) *dst;
 
-	    *dst++ = (uint8_t) pack_1x64_32 (
-		_mm_adds_pu16 (
-		    pix_multiply_1x64 (
-			_mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
-		    unpack_32_1x64 (d)));
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		_mm_adds_epu16 (
+		    pix_multiply_1x128 (
+			xmm_alpha, unpack_32_1x128 (m)),
+		    unpack_32_1x128 (d)));
	    w--;
	}
 
@@ -4973,10 +5065,10 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
 
	while (w && ((unsigned long)dst & 15))
	{
-	    *dst = (uint8_t)_mm_cvtsi64_si32 (
-		_mm_adds_pu8 (
-		    _mm_movepi64_pi64 (xmm_src),
-		    _mm_cvtsi32_si64 (*dst)));
+	    *dst = (uint8_t)_mm_cvtsi128_si32 (
+		_mm_adds_epu8 (
+		    xmm_src,
+		    _mm_cvtsi32_si128 (*dst)));
 
	    w--;
	    dst++;
@@ -4993,10 +5085,10 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
 
	while (w)
	{
-	    *dst = (uint8_t)_mm_cvtsi64_si32 (
-		_mm_adds_pu8 (
-		    _mm_movepi64_pi64 (xmm_src),
-		    _mm_cvtsi32_si64 (*dst)));
+	    *dst = (uint8_t)_mm_cvtsi128_si32 (
+		_mm_adds_epu8 (
+		    xmm_src,
+		    _mm_cvtsi32_si128 (*dst)));
 
	    w--;
	    dst++;
@@ -5284,7 +5376,7 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
     uint32_t m;
     int src_stride, mask_stride, dst_stride;
     int32_t w;
-    __m64 ms;
+    __m128i ms;
 
     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
@@ -5313,17 +5405,17 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
	    s = 0xff000000 | *src++;
	    m = (uint32_t) *mask++;
	    d = *dst;
-	    ms = unpack_32_1x64 (s);
+	    ms = unpack_32_1x128 (s);
 
	    if (m != 0xff)
	    {
-		__m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
-		__m64 md = unpack_32_1x64 (d);
+		__m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+		__m128i md = unpack_32_1x128 (d);
 
-		ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
+		ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
	    }
 
-	    *dst++ = pack_1x64_32 (ms);
+	    *dst++ = pack_1x128_32 (ms);
 
	    w--;
	}
@@ -5373,15 +5465,15 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
	    }
	    else
	    {
-		__m64 ma, md, ms;
+		__m128i ma, md, ms;
 
		d = *dst;
 
-		ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
-		md = unpack_32_1x64 (d);
-		ms = unpack_32_1x64 (s);
+		ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+		md = unpack_32_1x128 (d);
+		ms = unpack_32_1x128 (s);
 
-		*dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
+		*dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
	    }
 
	}
@@ -5457,15 +5549,15 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
	    }
	    else
	    {
-		__m64 ms, md, ma, msa;
+		__m128i ms, md, ma, msa;
 
-		ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
-		ms = unpack_32_1x64 (s);
-		md = unpack_32_1x64 (d);
+		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+		ms = unpack_32_1x128 (s);
+		md = unpack_32_1x128 (d);
 
-		msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
+		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
 
-		*dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
+		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
	    }
 
	}
@@ -5529,15 +5621,15 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
	    }
	    else
	    {
-		__m64 ms, md, ma, msa;
+		__m128i ms, md, ma, msa;
 
-		ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
-		ms = unpack_32_1x64 (s);
-		md = unpack_32_1x64 (d);
+		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+		ms = unpack_32_1x128 (s);
+		md = unpack_32_1x128 (d);
 
-		msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
+		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
 
-		*dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
+		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
	    }
 
	}
@@ -5591,12 +5683,12 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
	while (w && (unsigned long)dst & 15)
	{
-	    __m64 vd;
+	    __m128i vd;
 
-	    vd = unpack_32_1x64 (*dst);
+	    vd = unpack_32_1x128 (*dst);
 
-	    *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
-					    _mm_movepi64_pi64 (xmm_src)));
+	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
+					      xmm_src));
	    w--;
	    dst++;
	}
@@ -5626,12 +5718,12 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
	while (w)
	{
-	    __m64 vd;
+	    __m128i vd;
 
-	    vd = unpack_32_1x64 (*dst);
+	    vd = unpack_32_1x128 (*dst);
 
-	    *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
-					    _mm_movepi64_pi64 (xmm_src)));
+	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
+					      xmm_src));
	    w--;
	    dst++;
	}
@@ -5703,15 +5795,15 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
	    }
	    else
	    {
-		__m64 ms, md, ma, msa;
+		__m128i ms, md, ma, msa;
 
-		ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
-		ms = unpack_32_1x64 (s);
-		md = unpack_32_1x64 (d);
+		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+		ms = unpack_32_1x128 (s);
+		md = unpack_32_1x128 (d);
 
-		msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
+		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
 
-		*dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
+		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
	    }
 
	}
@@ -5773,15 +5865,15 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
	    }
	    else
	    {
-		__m64 ms, md, ma, msa;
+		__m128i ms, md, ma, msa;
 
-		ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
-		ms = unpack_32_1x64 (s);
-		md = unpack_32_1x64 (d);
+		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+		ms = unpack_32_1x128 (s);
+		md = unpack_32_1x128 (d);
 
-		msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
+		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
 
-		*dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
+		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
	    }
 
	}
@@ -5927,13 +6019,13 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
	    {
		uint32_t d = *dst;
 
-		__m64 ms = unpack_32_1x64 (s);
-		__m64 alpha = expand_alpha_1x64 (ms);
-		__m64 dest  = _mm_movepi64_pi64 (xmm_mask);
-		__m64 alpha_dst = unpack_32_1x64 (d);
+		__m128i ms = unpack_32_1x128 (s);
+		__m128i alpha = expand_alpha_1x128 (ms);
+		__m128i dest  = xmm_mask;
+		__m128i alpha_dst = unpack_32_1x128 (d);
 
-		*dst = pack_1x64_32 (
-		    in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));
+		*dst = pack_1x128_32 (
+		    in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
	    }
	    dst++;
	    w--;
@@ -5985,13 +6077,13 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
	    {
		uint32_t d = *dst;
 
-		__m64 ms = unpack_32_1x64 (s);
-		__m64 alpha = expand_alpha_1x64 (ms);
-		__m64 mask  = _mm_movepi64_pi64 (xmm_mask);
-		__m64 dest  = unpack_32_1x64 (d);
+		__m128i ms = unpack_32_1x128 (s);
+		__m128i alpha = expand_alpha_1x128 (ms);
+		__m128i mask  = xmm_mask;
+		__m128i dest  = unpack_32_1x128 (d);
 
-		*dst = pack_1x64_32 (
-		    in_over_1x64 (&ms, &alpha, &mask, &dest));
+		*dst = pack_1x128_32 (
+		    in_over_1x128 (&ms, &alpha, &mask, &dest));
	    }
 
	    dst++;
-- 
2.7.4
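
A note on the arithmetic behind the new 1x128 helpers, for readers of this
patch rather than as part of it: pix_multiply_1x128 computes pixman's usual
rounded per-channel multiply x*a/255 in each 16-bit lane. With
t = x*a + 0x80, the result is (t + (t >> 8)) >> 8, and multiplying t by
0x0101 and taking the high 16 bits (_mm_mulhi_epu16) yields exactly the same
value. The standalone sketch below mirrors the intrinsic sequence for a
single ARGB pixel and checks it against a scalar reference. It is
illustrative only: mul_un8 and pix_multiply_pixel are made-up names, and it
assumes the mask_0080/mask_0101 globals used above are the per-lane 16-bit
constants 0x0080 and 0x0101 that their names suggest.

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

/* Scalar reference: rounded x*a/255, as in pixman's MUL_UN8-style macros. */
static uint8_t
mul_un8 (uint8_t x, uint8_t a)
{
    uint16_t t = (uint16_t)(x * a + 0x80);
    return (uint8_t)((t + (t >> 8)) >> 8);
}

/* One-pixel SSE2 version: unpack bytes to 16-bit lanes, then
 * mullo + adds 0x0080 + mulhi 0x0101, then pack back to bytes. */
static uint32_t
pix_multiply_pixel (uint32_t src, uint32_t mask)
{
    __m128i zero = _mm_setzero_si128 ();
    __m128i s = _mm_unpacklo_epi8 (_mm_cvtsi32_si128 ((int)src), zero);
    __m128i m = _mm_unpacklo_epi8 (_mm_cvtsi32_si128 ((int)mask), zero);
    __m128i t = _mm_adds_epu16 (_mm_mullo_epi16 (s, m),
                                _mm_set1_epi16 (0x0080));
    /* (t * 0x0101) >> 16 == (t + (t >> 8)) >> 8 for all t < 65536 */
    __m128i r = _mm_mulhi_epu16 (t, _mm_set1_epi16 (0x0101));
    return (uint32_t)_mm_cvtsi128_si32 (_mm_packus_epi16 (r, zero));
}

int
main (void)
{
    uint32_t s = 0x80402010, m = 0xff804020, want = 0;
    int i;

    /* Build the expected result channel by channel with the scalar code. */
    for (i = 0; i < 4; i++)
        want |= (uint32_t)mul_un8 ((uint8_t)(s >> (i * 8)),
                                   (uint8_t)(m >> (i * 8))) << (i * 8);

    printf ("sse2 %08x scalar %08x\n", pix_multiply_pixel (s, m), want);
    return pix_multiply_pixel (s, m) != want;
}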