#include <config.h>
#endif
-#include <mmintrin.h>
#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"
#include "pixman-combine32.h"
-
-#if defined(_MSC_VER) && defined(_M_AMD64)
-/* Windows 64 doesn't allow MMX to be used, so
- * the pixman-x64-mmx-emulation.h file contains
- * implementations of those MMX intrinsics that
- * are used in the SSE2 implementation.
- */
-# include "pixman-x64-mmx-emulation.h"
-#endif
-
-#ifdef USE_SSE2
-
-/* --------------------------------------------------------------------
- * Locals
- */
-
-static __m64 mask_x0080;
-static __m64 mask_x00ff;
-static __m64 mask_x0101;
-static __m64 mask_x_alpha;
-
-static __m64 mask_x565_rgb;
-static __m64 mask_x565_unpack;
+#include "pixman-inlines.h"
static __m128i mask_0080;
static __m128i mask_00ff;
static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;
-/* ----------------------------------------------------------------------
- * SSE2 Inlines
- */
static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}
-static force_inline void
-cache_prefetch (__m128i* addr)
-{
- _mm_prefetch ((void const*)addr, _MM_HINT_T0);
-}
-
-static force_inline void
-cache_prefetch_next (__m128i* addr)
-{
- _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
-}
-
-/* prefetching NULL is very slow on some systems. don't do that. */
-
-static force_inline void
-maybe_prefetch (__m128i* addr)
-{
- if (addr)
- cache_prefetch (addr);
-}
-
-static force_inline void
-maybe_prefetch_next (__m128i* addr)
-{
- if (addr)
- cache_prefetch_next (addr);
-}
-
/* load 4 pixels from a 16-byte boundary aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
_mm_storeu_si128 (dst, data);
}
-/* ------------------------------------------------------------------
- * MMX inlines
- */
-
-static force_inline __m64
-load_32_1x64 (uint32_t data)
-{
- return _mm_cvtsi32_si64 (data);
-}
-
-static force_inline __m64
-unpack_32_1x64 (uint32_t data)
-{
- return _mm_unpacklo_pi8 (load_32_1x64 (data), _mm_setzero_si64 ());
-}
-
-static force_inline __m64
-expand_alpha_1x64 (__m64 data)
+static force_inline __m128i
+load_32_1x128 (uint32_t data)
{
- return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
+ return _mm_cvtsi32_si128 (data);
}
-static force_inline __m64
-expand_alpha_rev_1x64 (__m64 data)
+static force_inline __m128i
+expand_alpha_rev_1x128 (__m128i data)
{
- return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
+ return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}
-static force_inline __m64
-expand_pixel_8_1x64 (uint8_t data)
+static force_inline __m128i
+expand_pixel_8_1x128 (uint8_t data)
{
- return _mm_shuffle_pi16 (
- unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
+ return _mm_shufflelo_epi16 (
+ unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}
-static force_inline __m64
-pix_multiply_1x64 (__m64 data,
- __m64 alpha)
+static force_inline __m128i
+pix_multiply_1x128 (__m128i data,
+ __m128i alpha)
{
- return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
- mask_x0080),
- mask_x0101);
+ return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
+ mask_0080),
+ mask_0101);
}
-static force_inline __m64
-pix_add_multiply_1x64 (__m64* src,
- __m64* alpha_dst,
- __m64* dst,
- __m64* alpha_src)
+static force_inline __m128i
+pix_add_multiply_1x128 (__m128i* src,
+ __m128i* alpha_dst,
+ __m128i* dst,
+ __m128i* alpha_src)
{
- __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
- __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);
+ __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
+ __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
- return _mm_adds_pu8 (t1, t2);
+ return _mm_adds_epu8 (t1, t2);
}
-static force_inline __m64
-negate_1x64 (__m64 data)
+static force_inline __m128i
+negate_1x128 (__m128i data)
{
- return _mm_xor_si64 (data, mask_x00ff);
+ return _mm_xor_si128 (data, mask_00ff);
}
-static force_inline __m64
-invert_colors_1x64 (__m64 data)
+static force_inline __m128i
+invert_colors_1x128 (__m128i data)
{
- return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
+ return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}
-static force_inline __m64
-over_1x64 (__m64 src, __m64 alpha, __m64 dst)
+static force_inline __m128i
+over_1x128 (__m128i src, __m128i alpha, __m128i dst)
{
- return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
+ return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
}
-static force_inline __m64
-in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
+static force_inline __m128i
+in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
{
- return over_1x64 (pix_multiply_1x64 (*src, *mask),
- pix_multiply_1x64 (*alpha, *mask),
- *dst);
+ return over_1x128 (pix_multiply_1x128 (*src, *mask),
+ pix_multiply_1x128 (*alpha, *mask),
+ *dst);
}
-static force_inline __m64
-over_rev_non_pre_1x64 (__m64 src, __m64 dst)
+static force_inline __m128i
+over_rev_non_pre_1x128 (__m128i src, __m128i dst)
{
- __m64 alpha = expand_alpha_1x64 (src);
+ __m128i alpha = expand_alpha_1x128 (src);
- return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
- _mm_or_si64 (alpha, mask_x_alpha)),
- alpha,
- dst);
+ return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
+ _mm_or_si128 (alpha, mask_alpha)),
+ alpha,
+ dst);
}
static force_inline uint32_t
-pack_1x64_32 (__m64 data)
+pack_1x128_32 (__m128i data)
{
- return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
+ return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
}
-/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
- *
- * 00RR00GG00BB
- *
- * --- Expanding 565 in the low word ---
- *
- * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
- * m = m & (01f0003f001f);
- * m = m * (008404100840);
- * m = m >> 8;
- *
- * Note the trick here - the top word is shifted by another nibble to
- * avoid it bumping into the middle word
- */
-static force_inline __m64
-expand565_16_1x64 (uint16_t pixel)
+static force_inline __m128i
+expand565_16_1x128 (uint16_t pixel)
{
- __m64 p;
- __m64 t1, t2;
-
- p = _mm_cvtsi32_si64 ((uint32_t) pixel);
-
- t1 = _mm_slli_si64 (p, 36 - 11);
- t2 = _mm_slli_si64 (p, 16 - 5);
+ __m128i m = _mm_cvtsi32_si128 (pixel);
- p = _mm_or_si64 (t1, p);
- p = _mm_or_si64 (t2, p);
- p = _mm_and_si64 (p, mask_x565_rgb);
- p = _mm_mullo_pi16 (p, mask_x565_unpack);
+ m = unpack_565_to_8888 (m);
- return _mm_srli_pi16 (p, 8);
+ return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
}
-/* ----------------------------------------------------------------------------
- * Compose Core transformations
- */
static force_inline uint32_t
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
uint8_t a;
- __m64 ms;
+ __m128i xmms;
a = src >> 24;
}
else if (src)
{
- ms = unpack_32_1x64 (src);
- return pack_1x64_32 (
- over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
+ xmms = unpack_32_1x128 (src);
+ return pack_1x128_32 (
+ over_1x128 (xmms, expand_alpha_1x128 (xmms),
+ unpack_32_1x128 (dst)));
}
return dst;
if (pm)
{
- __m64 ms, mm;
+ __m128i ms, mm;
- mm = unpack_32_1x64 (*pm);
- mm = expand_alpha_1x64 (mm);
+ mm = unpack_32_1x128 (*pm);
+ mm = expand_alpha_1x128 (mm);
- ms = unpack_32_1x64 (s);
- ms = pix_multiply_1x64 (ms, mm);
+ ms = unpack_32_1x128 (s);
+ ms = pix_multiply_1x128 (ms, mm);
- s = pack_1x64_32 (ms);
+ s = pack_1x128_32 (ms);
}
return s;
}
static force_inline void
-core_combine_over_u_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t* pm,
- int w)
+core_combine_over_u_sse2_mask (uint32_t * pd,
+ const uint32_t* ps,
+ const uint32_t* pm,
+ int w)
{
uint32_t s, d;
- __m128i xmm_dst_lo, xmm_dst_hi;
- __m128i xmm_src_lo, xmm_src_hi;
- __m128i xmm_alpha_lo, xmm_alpha_hi;
-
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
/* Align dst on a 16-byte boundary */
while (w && ((unsigned long)pd & 15))
{
d = *pd;
s = combine1 (ps, pm);
- *pd++ = core_combine_over_u_pixel_sse2 (s, d);
+ if (s)
+ *pd = core_combine_over_u_pixel_sse2 (s, d);
+ pd++;
ps++;
- if (pm)
- pm++;
+ pm++;
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
- /* I'm loading unaligned because I'm not sure about
- * the address alignment.
- */
- xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+ __m128i mask = load_128_unaligned ((__m128i *)pm);
- if (is_opaque (xmm_src_hi))
- {
- save_128_aligned ((__m128i*)pd, xmm_src_hi);
- }
- else if (!is_zero (xmm_src_hi))
+ if (!is_zero (mask))
{
- xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+ __m128i src;
+ __m128i src_hi, src_lo;
+ __m128i mask_hi, mask_lo;
+ __m128i alpha_hi, alpha_lo;
- unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ src = load_128_unaligned ((__m128i *)ps);
- expand_alpha_2x128 (
- xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+ if (is_opaque (_mm_and_si128 (src, mask)))
+ {
+ save_128_aligned ((__m128i *)pd, src);
+ }
+ else
+ {
+ __m128i dst = load_128_aligned ((__m128i *)pd);
+ __m128i dst_hi, dst_lo;
- over_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (mask, &mask_lo, &mask_hi);
+ unpack_128_2x128 (src, &src_lo, &src_hi);
- /* rebuid the 4 pixel data and save*/
- save_128_aligned ((__m128i*)pd,
- pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
+ pix_multiply_2x128 (&src_lo, &src_hi,
+ &mask_lo, &mask_hi,
+ &src_lo, &src_hi);
+
+ unpack_128_2x128 (dst, &dst_lo, &dst_hi);
+
+ expand_alpha_2x128 (src_lo, src_hi,
+ &alpha_lo, &alpha_hi);
+
+ over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
+ &dst_lo, &dst_hi);
+
+ save_128_aligned (
+ (__m128i *)pd,
+ pack_2x128_128 (dst_lo, dst_hi));
+ }
}
- w -= 4;
+ pm += 4;
ps += 4;
pd += 4;
- if (pm)
- pm += 4;
+ w -= 4;
}
-
while (w)
{
d = *pd;
s = combine1 (ps, pm);
- *pd++ = core_combine_over_u_pixel_sse2 (s, d);
+ if (s)
+ *pd = core_combine_over_u_pixel_sse2 (s, d);
+ pd++;
+ ps++;
+ pm++;
+
+ w--;
+ }
+}
+
+static force_inline void
+core_combine_over_u_sse2_no_mask (uint32_t * pd,
+ const uint32_t* ps,
+ int w)
+{
+ uint32_t s, d;
+
+ /* Align dst on a 16-byte boundary */
+ while (w && ((unsigned long)pd & 15))
+ {
+ d = *pd;
+ s = *ps;
+
+ if (s)
+ *pd = core_combine_over_u_pixel_sse2 (s, d);
+ pd++;
+ ps++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ __m128i src;
+ __m128i src_hi, src_lo, dst_hi, dst_lo;
+ __m128i alpha_hi, alpha_lo;
+
+ src = load_128_unaligned ((__m128i *)ps);
+
+ if (!is_zero (src))
+ {
+ if (is_opaque (src))
+ {
+ save_128_aligned ((__m128i *)pd, src);
+ }
+ else
+ {
+ __m128i dst = load_128_aligned ((__m128i *)pd);
+
+ unpack_128_2x128 (src, &src_lo, &src_hi);
+ unpack_128_2x128 (dst, &dst_lo, &dst_hi);
+
+ expand_alpha_2x128 (src_lo, src_hi,
+ &alpha_lo, &alpha_hi);
+ over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
+ &dst_lo, &dst_hi);
+
+ save_128_aligned (
+ (__m128i *)pd,
+ pack_2x128_128 (dst_lo, dst_hi));
+ }
+ }
+
+ ps += 4;
+ pd += 4;
+ w -= 4;
+ }
+ while (w)
+ {
+ d = *pd;
+ s = *ps;
+
+ if (s)
+ *pd = core_combine_over_u_pixel_sse2 (s, d);
+ pd++;
ps++;
- if (pm)
- pm++;
w--;
}
}
static force_inline void
-core_combine_over_reverse_u_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t* pm,
- int w)
+sse2_combine_over_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
+{
+ if (pm)
+ core_combine_over_u_sse2_mask (pd, ps, pm, w);
+ else
+ core_combine_over_u_sse2_no_mask (pd, ps, w);
+}
+
+static void
+sse2_combine_over_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, d;
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_alpha_lo, xmm_alpha_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
/* Align dst on a 16-byte boundary */
while (w &&
((unsigned long)pd & 15))
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
/* I'm loading unaligned because I'm not sure
* about the address alignment.
*/
}
static force_inline uint32_t
-core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
+core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
uint32_t maska = src >> 24;
}
else if (maska != 0xff)
{
- return pack_1x64_32 (
- pix_multiply_1x64 (unpack_32_1x64 (dst),
- expand_alpha_1x64 (unpack_32_1x64 (src))));
+ return pack_1x128_32 (
+ pix_multiply_1x128 (unpack_32_1x128 (dst),
+ expand_alpha_1x128 (unpack_32_1x128 (src))));
}
return dst;
}
-static force_inline void
-core_combine_in_u_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t* pm,
- int w)
+static void
+sse2_combine_in_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, d;
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_dst_lo, xmm_dst_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
s = combine1 (ps, pm);
d = *pd;
- *pd++ = core_combine_in_u_pixelsse2 (d, s);
+ *pd++ = core_combine_in_u_pixel_sse2 (d, s);
w--;
ps++;
if (pm)
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
s = combine1 (ps, pm);
d = *pd;
- *pd++ = core_combine_in_u_pixelsse2 (d, s);
+ *pd++ = core_combine_in_u_pixel_sse2 (d, s);
w--;
ps++;
if (pm)
}
}
-static force_inline void
-core_combine_reverse_in_u_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t *pm,
- int w)
+static void
+sse2_combine_in_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, d;
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_dst_lo, xmm_dst_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
s = combine1 (ps, pm);
d = *pd;
- *pd++ = core_combine_in_u_pixelsse2 (s, d);
+ *pd++ = core_combine_in_u_pixel_sse2 (s, d);
ps++;
w--;
if (pm)
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
s = combine1 (ps, pm);
d = *pd;
- *pd++ = core_combine_in_u_pixelsse2 (s, d);
+ *pd++ = core_combine_in_u_pixel_sse2 (s, d);
w--;
ps++;
if (pm)
}
}
-static force_inline void
-core_combine_reverse_out_u_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t* pm,
- int w)
+static void
+sse2_combine_out_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
uint32_t s = combine1 (ps, pm);
uint32_t d = *pd;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (d), negate_1x64 (
- expand_alpha_1x64 (unpack_32_1x64 (s)))));
-
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (d), negate_1x128 (
+ expand_alpha_1x128 (unpack_32_1x128 (s)))));
+
if (pm)
pm++;
ps++;
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_dst_lo, xmm_dst_hi;
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
uint32_t s = combine1 (ps, pm);
uint32_t d = *pd;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (d), negate_1x64 (
- expand_alpha_1x64 (unpack_32_1x64 (s)))));
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (d), negate_1x128 (
+ expand_alpha_1x128 (unpack_32_1x128 (s)))));
ps++;
if (pm)
pm++;
}
}
-static force_inline void
-core_combine_out_u_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t* pm,
- int w)
+static void
+sse2_combine_out_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
uint32_t s = combine1 (ps, pm);
uint32_t d = *pd;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (s), negate_1x64 (
- expand_alpha_1x64 (unpack_32_1x64 (d)))));
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (s), negate_1x128 (
+ expand_alpha_1x128 (unpack_32_1x128 (d)))));
w--;
ps++;
if (pm)
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_dst_lo, xmm_dst_hi;
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
uint32_t s = combine1 (ps, pm);
uint32_t d = *pd;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (s), negate_1x64 (
- expand_alpha_1x64 (unpack_32_1x64 (d)))));
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (s), negate_1x128 (
+ expand_alpha_1x128 (unpack_32_1x128 (d)))));
w--;
ps++;
if (pm)
core_combine_atop_u_pixel_sse2 (uint32_t src,
uint32_t dst)
{
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
+ __m128i s = unpack_32_1x128 (src);
+ __m128i d = unpack_32_1x128 (dst);
- __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
- __m64 da = expand_alpha_1x64 (d);
+ __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
+ __m128i da = expand_alpha_1x128 (d);
- return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
+ return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}
-static force_inline void
-core_combine_atop_u_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t* pm,
- int w)
+static void
+sse2_combine_atop_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, d;
__m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
s = combine1 (ps, pm);
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
uint32_t dst)
{
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
+ __m128i s = unpack_32_1x128 (src);
+ __m128i d = unpack_32_1x128 (dst);
- __m64 sa = expand_alpha_1x64 (s);
- __m64 da = negate_1x64 (expand_alpha_1x64 (d));
+ __m128i sa = expand_alpha_1x128 (s);
+ __m128i da = negate_1x128 (expand_alpha_1x128 (d));
- return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
+ return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}
-static force_inline void
-core_combine_reverse_atop_u_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t* pm,
- int w)
+static void
+sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, d;
__m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
s = combine1 (ps, pm);
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
core_combine_xor_u_pixel_sse2 (uint32_t src,
uint32_t dst)
{
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
+ __m128i s = unpack_32_1x128 (src);
+ __m128i d = unpack_32_1x128 (dst);
- __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
- __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));
+ __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
+ __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
- return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
+ return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
}
-static force_inline void
-core_combine_xor_u_sse2 (uint32_t* dst,
- const uint32_t* src,
- const uint32_t *mask,
- int width)
+static void
+sse2_combine_xor_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
{
int w = width;
uint32_t s, d;
__m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
s = combine1 (ps, pm);
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
xmm_dst = load_128_aligned ((__m128i*) pd);
}
static force_inline void
-core_combine_add_u_sse2 (uint32_t* dst,
- const uint32_t* src,
- const uint32_t* mask,
- int width)
+sse2_combine_add_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
{
int w = width;
uint32_t s, d;
const uint32_t* ps = src;
const uint32_t* pm = mask;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = combine1 (ps, pm);
ps++;
if (pm)
pm++;
- *pd++ = _mm_cvtsi64_si32 (
- _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
+ *pd++ = _mm_cvtsi128_si32 (
+ _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
__m128i s;
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
s = combine4 ((__m128i*)ps, (__m128i*)pm);
save_128_aligned (
d = *pd;
ps++;
- *pd++ = _mm_cvtsi64_si32 (
- _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
+ *pd++ = _mm_cvtsi128_si32 (
+ _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
if (pm)
pm++;
}
core_combine_saturate_u_pixel_sse2 (uint32_t src,
uint32_t dst)
{
- __m64 ms = unpack_32_1x64 (src);
- __m64 md = unpack_32_1x64 (dst);
+ __m128i ms = unpack_32_1x128 (src);
+ __m128i md = unpack_32_1x128 (dst);
uint32_t sa = src >> 24;
uint32_t da = ~dst >> 24;
if (sa > da)
{
- ms = pix_multiply_1x64 (
- ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
+ ms = pix_multiply_1x128 (
+ ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
}
- return pack_1x64_32 (_mm_adds_pu16 (md, ms));
+ return pack_1x128_32 (_mm_adds_epu16 (md, ms));
}
-static force_inline void
-core_combine_saturate_u_sse2 (uint32_t * pd,
- const uint32_t *ps,
- const uint32_t *pm,
- int w)
+static void
+sse2_combine_saturate_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, d;
uint32_t pack_cmp;
__m128i xmm_src, xmm_dst;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = combine1 (ps, pm);
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_dst = load_128_aligned ((__m128i*)pd);
xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
}
}
-static force_inline void
-core_combine_src_ca_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t *pm,
- int w)
+static void
+sse2_combine_src_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, m;
__m128i xmm_mask_lo, xmm_mask_hi;
__m128i xmm_dst_lo, xmm_dst_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
m = *pm++;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
{
s = *ps++;
m = *pm++;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
w--;
}
}
uint32_t mask,
uint32_t dst)
{
- __m64 s = unpack_32_1x64 (src);
- __m64 expAlpha = expand_alpha_1x64 (s);
- __m64 unpk_mask = unpack_32_1x64 (mask);
- __m64 unpk_dst = unpack_32_1x64 (dst);
+ __m128i s = unpack_32_1x128 (src);
+ __m128i expAlpha = expand_alpha_1x128 (s);
+ __m128i unpk_mask = unpack_32_1x128 (mask);
+ __m128i unpk_dst = unpack_32_1x128 (dst);
- return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
+ return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
}
-static force_inline void
-core_combine_over_ca_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t *pm,
- int w)
+static void
+sse2_combine_over_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, m, d;
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
uint32_t mask,
uint32_t dst)
{
- __m64 d = unpack_32_1x64 (dst);
+ __m128i d = unpack_32_1x128 (dst);
- return pack_1x64_32 (
- over_1x64 (d, expand_alpha_1x64 (d),
- pix_multiply_1x64 (unpack_32_1x64 (src),
- unpack_32_1x64 (mask))));
+ return pack_1x128_32 (
+ over_1x128 (d, expand_alpha_1x128 (d),
+ pix_multiply_1x128 (unpack_32_1x128 (src),
+ unpack_32_1x128 (mask))));
}
-static force_inline void
-core_combine_over_reverse_ca_sse2 (uint32_t* pd,
- const uint32_t* ps,
- const uint32_t *pm,
- int w)
+static void
+sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, m, d;
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
}
}
-static force_inline void
-core_combine_in_ca_sse2 (uint32_t * pd,
- const uint32_t *ps,
- const uint32_t *pm,
- int w)
+static void
+sse2_combine_in_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, m, d;
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
m = *pm++;
d = *pd;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
- expand_alpha_1x64 (unpack_32_1x64 (d))));
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
+ expand_alpha_1x128 (unpack_32_1x128 (d))));
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
m = *pm++;
d = *pd;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- pix_multiply_1x64 (
- unpack_32_1x64 (s), unpack_32_1x64 (m)),
- expand_alpha_1x64 (unpack_32_1x64 (d))));
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (s), unpack_32_1x128 (m)),
+ expand_alpha_1x128 (unpack_32_1x128 (d))));
w--;
}
}
-static force_inline void
-core_combine_in_reverse_ca_sse2 (uint32_t * pd,
- const uint32_t *ps,
- const uint32_t *pm,
- int w)
-{
+static void
+sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
+{
uint32_t s, m, d;
__m128i xmm_alpha_lo, xmm_alpha_hi;
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
m = *pm++;
d = *pd;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (d),
- pix_multiply_1x64 (unpack_32_1x64 (m),
- expand_alpha_1x64 (unpack_32_1x64 (s)))));
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (d),
+ pix_multiply_1x128 (unpack_32_1x128 (m),
+ expand_alpha_1x128 (unpack_32_1x128 (s)))));
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
m = *pm++;
d = *pd;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (d),
- pix_multiply_1x64 (unpack_32_1x64 (m),
- expand_alpha_1x64 (unpack_32_1x64 (s)))));
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (d),
+ pix_multiply_1x128 (unpack_32_1x128 (m),
+ expand_alpha_1x128 (unpack_32_1x128 (s)))));
w--;
}
}
-static force_inline void
-core_combine_out_ca_sse2 (uint32_t * pd,
- const uint32_t *ps,
- const uint32_t *pm,
- int w)
+static void
+sse2_combine_out_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, m, d;
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
m = *pm++;
d = *pd;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- pix_multiply_1x64 (
- unpack_32_1x64 (s), unpack_32_1x64 (m)),
- negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (s), unpack_32_1x128 (m)),
+ negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
m = *pm++;
d = *pd;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- pix_multiply_1x64 (
- unpack_32_1x64 (s), unpack_32_1x64 (m)),
- negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (s), unpack_32_1x128 (m)),
+ negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
w--;
}
}
-static force_inline void
-core_combine_out_reverse_ca_sse2 (uint32_t * pd,
- const uint32_t *ps,
- const uint32_t *pm,
- int w)
+static void
+sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, m, d;
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
m = *pm++;
d = *pd;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (d),
- negate_1x64 (pix_multiply_1x64 (
- unpack_32_1x64 (m),
- expand_alpha_1x64 (unpack_32_1x64 (s))))));
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (d),
+ negate_1x128 (pix_multiply_1x128 (
+ unpack_32_1x128 (m),
+ expand_alpha_1x128 (unpack_32_1x128 (s))))));
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
m = *pm++;
d = *pd;
- *pd++ = pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (d),
- negate_1x64 (pix_multiply_1x64 (
- unpack_32_1x64 (m),
- expand_alpha_1x64 (unpack_32_1x64 (s))))));
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (d),
+ negate_1x128 (pix_multiply_1x128 (
+ unpack_32_1x128 (m),
+ expand_alpha_1x128 (unpack_32_1x128 (s))))));
w--;
}
}
uint32_t mask,
uint32_t dst)
{
- __m64 m = unpack_32_1x64 (mask);
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
- __m64 sa = expand_alpha_1x64 (s);
- __m64 da = expand_alpha_1x64 (d);
+ __m128i m = unpack_32_1x128 (mask);
+ __m128i s = unpack_32_1x128 (src);
+ __m128i d = unpack_32_1x128 (dst);
+ __m128i sa = expand_alpha_1x128 (s);
+ __m128i da = expand_alpha_1x128 (d);
- s = pix_multiply_1x64 (s, m);
- m = negate_1x64 (pix_multiply_1x64 (m, sa));
+ s = pix_multiply_1x128 (s, m);
+ m = negate_1x128 (pix_multiply_1x128 (m, sa));
- return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
+ return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
}
-static force_inline void
-core_combine_atop_ca_sse2 (uint32_t * pd,
- const uint32_t *ps,
- const uint32_t *pm,
- int w)
+static void
+sse2_combine_atop_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, m, d;
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
uint32_t mask,
uint32_t dst)
{
- __m64 m = unpack_32_1x64 (mask);
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
+ __m128i m = unpack_32_1x128 (mask);
+ __m128i s = unpack_32_1x128 (src);
+ __m128i d = unpack_32_1x128 (dst);
- __m64 da = negate_1x64 (expand_alpha_1x64 (d));
- __m64 sa = expand_alpha_1x64 (s);
+ __m128i da = negate_1x128 (expand_alpha_1x128 (d));
+ __m128i sa = expand_alpha_1x128 (s);
- s = pix_multiply_1x64 (s, m);
- m = pix_multiply_1x64 (m, sa);
+ s = pix_multiply_1x128 (s, m);
+ m = pix_multiply_1x128 (m, sa);
- return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da));
+ return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
}
-static force_inline void
-core_combine_reverse_atop_ca_sse2 (uint32_t * pd,
- const uint32_t *ps,
- const uint32_t *pm,
- int w)
+static void
+sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, m, d;
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
uint32_t mask,
uint32_t dst)
{
- __m64 a = unpack_32_1x64 (mask);
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
+ __m128i a = unpack_32_1x128 (mask);
+ __m128i s = unpack_32_1x128 (src);
+ __m128i d = unpack_32_1x128 (dst);
- __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 (
- a, expand_alpha_1x64 (s)));
- __m64 dest = pix_multiply_1x64 (s, a);
- __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d));
+ __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
+ a, expand_alpha_1x128 (s)));
+ __m128i dest = pix_multiply_1x128 (s, a);
+ __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
- return pack_1x64_32 (pix_add_multiply_1x64 (&d,
+ return pack_1x128_32 (pix_add_multiply_1x128 (&d,
&alpha_dst,
&dest,
&alpha_src));
}
-static force_inline void
-core_combine_xor_ca_sse2 (uint32_t * pd,
- const uint32_t *ps,
- const uint32_t *pm,
- int w)
+static void
+sse2_combine_xor_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, m, d;
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
}
}
-static force_inline void
-core_combine_add_ca_sse2 (uint32_t * pd,
- const uint32_t *ps,
- const uint32_t *pm,
- int w)
+static void
+sse2_combine_add_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, m, d;
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
m = *pm++;
d = *pd;
- *pd++ = pack_1x64_32 (
- _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
- unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
+ *pd++ = pack_1x128_32 (
+ _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
+ unpack_32_1x128 (m)),
+ unpack_32_1x128 (d)));
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
m = *pm++;
d = *pd;
- *pd++ = pack_1x64_32 (
- _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
- unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
+ *pd++ = pack_1x128_32 (
+ _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
+ unpack_32_1x128 (m)),
+ unpack_32_1x128 (d)));
w--;
}
}
-/* ---------------------------------------------------
- * fb_compose_setup_sSE2
- */
-static force_inline __m64
-create_mask_16_64 (uint16_t mask)
-{
- return _mm_set1_pi16 (mask);
-}
-
/* Build a 128-bit constant from a single 16-bit value.
 *
 * mask: the 16-bit value to replicate.
 *
 * Returns the result of _mm_set1_epi16 (mask), i.e. a vector whose
 * eight 16-bit lanes all hold that value.
 */
static force_inline __m128i
create_mask_16_128 (uint16_t mask)
{
return _mm_set1_epi16 (mask);
}
-static force_inline __m64
-create_mask_2x32_64 (uint32_t mask0,
- uint32_t mask1)
-{
- return _mm_set_pi32 (mask0, mask1);
-}
-
/* Work around a code generation bug in Sun Studio 12. */
#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
-# define create_mask_2x32_128(mask0, mask1) \
- (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
+# define create_mask_2x32_128(mask0, mask1) \
+ (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
#else
static force_inline __m128i
create_mask_2x32_128 (uint32_t mask0,
}
#endif
-/* SSE2 code patch for fbcompose.c */
-
-static void
-sse2_combine_over_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_over_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_over_reverse_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_over_reverse_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_in_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_in_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_in_reverse_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_reverse_in_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_out_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_out_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_out_reverse_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_reverse_out_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_atop_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_atop_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_xor_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_xor_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_add_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_add_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_saturate_u (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_saturate_u_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_src_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_src_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_over_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_over_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_in_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_in_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_out_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_out_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_atop_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_atop_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_xor_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_xor_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-static void
-sse2_combine_add_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- uint32_t * dst,
- const uint32_t * src,
- const uint32_t * mask,
- int width)
-{
- core_combine_add_ca_sse2 (dst, src, mask, width);
- _mm_empty ();
-}
-
-/* -------------------------------------------------------------------
- * composite_over_n_8888
- */
-
static void
sse2_composite_over_n_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
+ pixman_composite_info_t *info)
{
+ PIXMAN_COMPOSITE_ARGS (info);
uint32_t src;
uint32_t *dst_line, *dst, d;
- uint16_t w;
+ int32_t w;
int dst_stride;
__m128i xmm_src, xmm_alpha;
__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
if (src == 0)
return;
PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
xmm_src = expand_pixel_32_1x128 (src);
xmm_alpha = expand_alpha_1x128 (xmm_src);
{
dst = dst_line;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
-
dst_line += dst_stride;
w = width;
while (w && (unsigned long)dst & 15)
{
d = *dst;
- *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
- _mm_movepi64_pi64 (xmm_alpha),
- unpack_32_1x64 (d)));
+ *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
+ xmm_alpha,
+ unpack_32_1x128 (d)));
w--;
}
- cache_prefetch ((__m128i*)dst);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)dst);
-
xmm_dst = load_128_aligned ((__m128i*)dst);
unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
while (w)
{
d = *dst;
- *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
- _mm_movepi64_pi64 (xmm_alpha),
- unpack_32_1x64 (d)));
+ *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
+ xmm_alpha,
+ unpack_32_1x128 (d)));
w--;
}
}
- _mm_empty ();
}
-/* ---------------------------------------------------------------------
- * composite_over_n_0565
- */
static void
sse2_composite_over_n_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
+ pixman_composite_info_t *info)
{
+ PIXMAN_COMPOSITE_ARGS (info);
uint32_t src;
uint16_t *dst_line, *dst, d;
- uint16_t w;
+ int32_t w;
int dst_stride;
__m128i xmm_src, xmm_alpha;
__m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
if (src == 0)
return;
PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+ dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
xmm_src = expand_pixel_32_1x128 (src);
xmm_alpha = expand_alpha_1x128 (xmm_src);
{
dst = dst_line;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
-
dst_line += dst_stride;
w = width;
d = *dst;
*dst++ = pack_565_32_16 (
- pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
- _mm_movepi64_pi64 (xmm_alpha),
- expand565_16_1x64 (d))));
+ pack_1x128_32 (over_1x128 (xmm_src,
+ xmm_alpha,
+ expand565_16_1x128 (d))));
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
-
while (w >= 8)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)dst);
-
xmm_dst = load_128_aligned ((__m128i*)dst);
unpack_565_128_4x128 (xmm_dst,
{
d = *dst;
*dst++ = pack_565_32_16 (
- pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
- _mm_movepi64_pi64 (xmm_alpha),
- expand565_16_1x64 (d))));
+ pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
+ expand565_16_1x128 (d))));
}
}
- _mm_empty ();
}
-/* ------------------------------
- * composite_add_n_8888_8888_ca
- */
static void
sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
+ pixman_composite_info_t *info)
{
- uint32_t src, srca;
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t src;
uint32_t *dst_line, d;
uint32_t *mask_line, m;
uint32_t pack_cmp;
int dst_stride, mask_stride;
- __m128i xmm_src, xmm_alpha;
+ __m128i xmm_src;
__m128i xmm_dst;
__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
- __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+ __m128i mmx_src, mmx_mask, mmx_dest;
+
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
- srca = src >> 24;
-
if (src == 0)
return;
PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (
mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
xmm_src = _mm_unpacklo_epi8 (
create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
- xmm_alpha = expand_alpha_1x128 (xmm_src);
- mmx_src = _mm_movepi64_pi64 (xmm_src);
- mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+ mmx_src = xmm_src;
while (height--)
{
dst_line += dst_stride;
mask_line += mask_stride;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
m = *pm++;
if (m)
{
d = *pd;
-
- mmx_mask = unpack_32_1x64 (m);
- mmx_dest = unpack_32_1x64 (d);
- *pd = pack_1x64_32 (
- _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
+ mmx_mask = unpack_32_1x128 (m);
+ mmx_dest = unpack_32_1x128 (d);
+
+ *pd = pack_1x128_32 (
+ _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
+ mmx_dest));
}
pd++;
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_mask = load_128_unaligned ((__m128i*)pm);
pack_cmp =
&xmm_mask_lo, &xmm_mask_hi,
&xmm_mask_lo, &xmm_mask_hi);
xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
-
+
save_128_aligned (
(__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
}
{
d = *pd;
- mmx_mask = unpack_32_1x64 (m);
- mmx_dest = unpack_32_1x64 (d);
+ mmx_mask = unpack_32_1x128 (m);
+ mmx_dest = unpack_32_1x128 (d);
- *pd = pack_1x64_32 (
- _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
+ *pd = pack_1x128_32 (
+ _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
+ mmx_dest));
}
pd++;
}
}
- _mm_empty ();
}
-/* ---------------------------------------------------------------------------
- * composite_over_n_8888_8888_ca
- */
-
static void
sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
+ pixman_composite_info_t *info)
{
+ PIXMAN_COMPOSITE_ARGS (info);
uint32_t src;
uint32_t *dst_line, d;
uint32_t *mask_line, m;
__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
- __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+ __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
if (src == 0)
return;
PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (
mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
xmm_src = _mm_unpacklo_epi8 (
create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
xmm_alpha = expand_alpha_1x128 (xmm_src);
- mmx_src = _mm_movepi64_pi64 (xmm_src);
- mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+ mmx_src = xmm_src;
+ mmx_alpha = xmm_alpha;
while (height--)
{
dst_line += dst_stride;
mask_line += mask_stride;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
m = *pm++;
if (m)
{
d = *pd;
- mmx_mask = unpack_32_1x64 (m);
- mmx_dest = unpack_32_1x64 (d);
+ mmx_mask = unpack_32_1x128 (m);
+ mmx_dest = unpack_32_1x128 (d);
- *pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
+ *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
&mmx_alpha,
&mmx_mask,
&mmx_dest));
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_mask = load_128_unaligned ((__m128i*)pm);
pack_cmp =
if (m)
{
d = *pd;
- mmx_mask = unpack_32_1x64 (m);
- mmx_dest = unpack_32_1x64 (d);
+ mmx_mask = unpack_32_1x128 (m);
+ mmx_dest = unpack_32_1x128 (d);
- *pd = pack_1x64_32 (
- in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
+ *pd = pack_1x128_32 (
+ in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
}
pd++;
}
}
- _mm_empty ();
}
-/*---------------------------------------------------------------------
- * composite_over_8888_n_8888
- */
-
static void
sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
+ pixman_composite_info_t *info)
{
+ PIXMAN_COMPOSITE_ARGS (info);
uint32_t *dst_line, *dst;
uint32_t *src_line, *src;
uint32_t mask;
- uint16_t w;
+ int32_t w;
int dst_stride, src_stride;
__m128i xmm_mask;
__m128i xmm_alpha_lo, xmm_alpha_hi;
PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (
src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
- mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
+ mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
xmm_mask = create_mask_16_128 (mask >> 24);
src_line += src_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
- cache_prefetch ((__m128i*)src);
-
while (w && (unsigned long)dst & 15)
{
uint32_t s = *src++;
- uint32_t d = *dst;
-
- __m64 ms = unpack_32_1x64 (s);
- __m64 alpha = expand_alpha_1x64 (ms);
- __m64 dest = _mm_movepi64_pi64 (xmm_mask);
- __m64 alpha_dst = unpack_32_1x64 (d);
-
- *dst++ = pack_1x64_32 (
- in_over_1x64 (&ms, &alpha, &dest, &alpha_dst));
+ if (s)
+ {
+ uint32_t d = *dst;
+
+ __m128i ms = unpack_32_1x128 (s);
+ __m128i alpha = expand_alpha_1x128 (ms);
+ __m128i dest = xmm_mask;
+ __m128i alpha_dst = unpack_32_1x128 (d);
+
+ *dst = pack_1x128_32 (
+ in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
+ }
+ dst++;
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
- cache_prefetch ((__m128i*)src);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)dst);
- cache_prefetch_next ((__m128i*)src);
-
xmm_src = load_128_unaligned ((__m128i*)src);
- xmm_dst = load_128_aligned ((__m128i*)dst);
-
- unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
- expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi);
-
- in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
- &xmm_alpha_lo, &xmm_alpha_hi,
- &xmm_mask, &xmm_mask,
- &xmm_dst_lo, &xmm_dst_hi);
-
- save_128_aligned (
- (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ if (!is_zero (xmm_src))
+ {
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_mask, &xmm_mask,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
dst += 4;
src += 4;
w -= 4;
while (w)
{
uint32_t s = *src++;
- uint32_t d = *dst;
- __m64 ms = unpack_32_1x64 (s);
- __m64 alpha = expand_alpha_1x64 (ms);
- __m64 mask = _mm_movepi64_pi64 (xmm_mask);
- __m64 dest = unpack_32_1x64 (d);
+ if (s)
+ {
+ uint32_t d = *dst;
+
+ __m128i ms = unpack_32_1x128 (s);
+ __m128i alpha = expand_alpha_1x128 (ms);
+ __m128i mask = xmm_mask;
+ __m128i dest = unpack_32_1x128 (d);
+
+ *dst = pack_1x128_32 (
+ in_over_1x128 (&ms, &alpha, &mask, &dest));
+ }
+
+ dst++;
+ w--;
+ }
+ }
+
+}
+
+static void /* Copies 32-bit pixels from source to destination, forcing the top byte of each pixel to 0xff. */
+sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info); /* presumably declares src_image, dest_image, src_x, src_y, dest_x, dest_y, width and height used below -- confirm against the macro */
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+ int32_t w;
+ int dst_stride, src_stride;
+
+
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ while (height--) /* one iteration per row */
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15) /* one pixel at a time until dst is 16-byte aligned */
+ {
+ *dst++ = *src++ | 0xff000000;
+ w--;
+ }
- *dst++ = pack_1x64_32 (
- in_over_1x64 (&ms, &alpha, &mask, &dest));
+ while (w >= 16) /* 16 pixels (four 128-bit vectors) per iteration */
+ {
+ __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
+
+ xmm_src1 = load_128_unaligned ((__m128i*)src + 0); /* only dst was aligned above, so src is read with unaligned loads */
+ xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
+ xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
+ xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
+
+ save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000)); /* mask_ff000000 is defined outside this block; presumably 0xff000000 in every 32-bit lane -- confirm */
+ save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
+ save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
+ save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
+
+ dst += 16;
+ src += 16;
+ w -= 16;
+ }
+ while (w) /* remaining 0-15 pixels, one at a time */
+ {
+ *dst++ = *src++ | 0xff000000;
w--;
}
}
- _mm_empty ();
}
-/* ---------------------------------------------------------------------
- * composite_over_x888_n_8888
- */
static void
sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
+ pixman_composite_info_t *info)
{
+ PIXMAN_COMPOSITE_ARGS (info);
uint32_t *dst_line, *dst;
uint32_t *src_line, *src;
uint32_t mask;
int dst_stride, src_stride;
- uint16_t w;
+ int32_t w;
__m128i xmm_mask, xmm_alpha;
__m128i xmm_src, xmm_src_lo, xmm_src_hi;
__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (
src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
- mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
+ mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
xmm_mask = create_mask_16_128 (mask >> 24);
xmm_alpha = mask_00ff;
src_line += src_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
- cache_prefetch ((__m128i*)src);
-
while (w && (unsigned long)dst & 15)
{
uint32_t s = (*src++) | 0xff000000;
uint32_t d = *dst;
- __m64 src = unpack_32_1x64 (s);
- __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
- __m64 mask = _mm_movepi64_pi64 (xmm_mask);
- __m64 dest = unpack_32_1x64 (d);
+ __m128i src = unpack_32_1x128 (s);
+ __m128i alpha = xmm_alpha;
+ __m128i mask = xmm_mask;
+ __m128i dest = unpack_32_1x128 (d);
- *dst++ = pack_1x64_32 (
- in_over_1x64 (&src, &alpha, &mask, &dest));
+ *dst++ = pack_1x128_32 (
+ in_over_1x128 (&src, &alpha, &mask, &dest));
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
- cache_prefetch ((__m128i*)src);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)dst);
- cache_prefetch_next ((__m128i*)src);
-
xmm_src = _mm_or_si128 (
load_128_unaligned ((__m128i*)src), mask_ff000000);
xmm_dst = load_128_aligned ((__m128i*)dst);
uint32_t s = (*src++) | 0xff000000;
uint32_t d = *dst;
- __m64 src = unpack_32_1x64 (s);
- __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
- __m64 mask = _mm_movepi64_pi64 (xmm_mask);
- __m64 dest = unpack_32_1x64 (d);
+ __m128i src = unpack_32_1x128 (s);
+ __m128i alpha = xmm_alpha;
+ __m128i mask = xmm_mask;
+ __m128i dest = unpack_32_1x128 (d);
- *dst++ = pack_1x64_32 (
- in_over_1x64 (&src, &alpha, &mask, &dest));
+ *dst++ = pack_1x128_32 (
+ in_over_1x128 (&src, &alpha, &mask, &dest));
w--;
}
}
- _mm_empty ();
}
-/* --------------------------------------------------------------------
- * composite_over_8888_8888
- */
static void
sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
+ pixman_composite_info_t *info)
{
+ PIXMAN_COMPOSITE_ARGS (info);
int dst_stride, src_stride;
uint32_t *dst_line, *dst;
uint32_t *src_line, *src;
PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (
src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
while (height--)
{
- core_combine_over_u_sse2 (dst, src, NULL, width);
+ sse2_combine_over_u (imp, op, dst, src, NULL, width);
dst += dst_stride;
src += src_stride;
}
- _mm_empty ();
}
-/* ------------------------------------------------------------------
- * composite_over_8888_0565
- */
static force_inline uint16_t
composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
{
- __m64 ms;
+ __m128i ms; /* the 32-bit source pixel after unpack_32_1x128; this patch replaces the former __m64 temporary */
- ms = unpack_32_1x64 (src);
+ ms = unpack_32_1x128 (src);
return pack_565_32_16 (
- pack_1x64_32 (
- over_1x64 (
- ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst))));
+ pack_1x128_32 ( /* result goes through pack_1x128_32, then pack_565_32_16 to give the 16-bit return value */
+ over_1x128 ( /* combines the source, its expanded alpha and the expanded 16-bit destination; presumably the OVER operator -- confirm in the helper */
+ ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
}
static void
sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
+ pixman_composite_info_t *info)
{
+ PIXMAN_COMPOSITE_ARGS (info);
uint16_t *dst_line, *dst, d;
uint32_t *src_line, *src, s;
int dst_stride, src_stride;
- uint16_t w;
+ int32_t w;
__m128i xmm_alpha_lo, xmm_alpha_hi;
__m128i xmm_src, xmm_src_lo, xmm_src_hi;
__m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+ dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (
src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-#if 0
- /* FIXME
- *
- * I copy the code from MMX one and keep the fixme.
- * If it's a problem there, probably is a problem here.
- */
- assert (src_image->drawable == mask_image->drawable);
-#endif
-
while (height--)
{
dst = dst_line;
src = src_line;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
dst_line += dst_stride;
src_line += src_stride;
w = width;
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
/* It's a 8 pixel loop */
while (w >= 8)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)src);
- cache_prefetch_next ((__m128i*)dst);
-
/* I'm loading unaligned because I'm not sure
* about the address alignment.
*/
}
}
- _mm_empty ();
}
-/* -----------------------------------------------------------------
- * composite_over_n_8_8888
- */
-
static void
sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
+ pixman_composite_info_t *info)
{
+ PIXMAN_COMPOSITE_ARGS (info);
uint32_t src, srca;
uint32_t *dst_line, *dst;
uint8_t *mask_line, *mask;
int dst_stride, mask_stride;
- uint16_t w;
+ int32_t w;
uint32_t m, d;
__m128i xmm_src, xmm_alpha, xmm_def;
__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
- __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+ __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
srca = src >> 24;
if (src == 0)
return;
PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (
mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
xmm_def = create_mask_2x32_128 (src, src);
xmm_src = expand_pixel_32_1x128 (src);
xmm_alpha = expand_alpha_1x128 (xmm_src);
- mmx_src = _mm_movepi64_pi64 (xmm_src);
- mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+ mmx_src = xmm_src;
+ mmx_alpha = xmm_alpha;
while (height--)
{
mask_line += mask_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w && (unsigned long)dst & 15)
{
uint8_t m = *mask++;
if (m)
{
d = *dst;
- mmx_mask = expand_pixel_8_1x64 (m);
- mmx_dest = unpack_32_1x64 (d);
+ mmx_mask = expand_pixel_8_1x128 (m);
+ mmx_dest = unpack_32_1x128 (d);
- *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
+ *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
&mmx_alpha,
&mmx_mask,
&mmx_dest));
dst++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)mask);
- cache_prefetch_next ((__m128i*)dst);
-
m = *((uint32_t*)mask);
if (srca == 0xff && m == 0xffffffff)
if (m)
{
d = *dst;
- mmx_mask = expand_pixel_8_1x64 (m);
- mmx_dest = unpack_32_1x64 (d);
+ mmx_mask = expand_pixel_8_1x128 (m);
+ mmx_dest = unpack_32_1x128 (d);
- *dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
+ *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
&mmx_alpha,
&mmx_mask,
&mmx_dest));
}
}
- _mm_empty ();
}
-/* ----------------------------------------------------------------
- * composite_over_n_8_8888
- */
-
-pixman_bool_t
+static pixman_bool_t /* Solid fill; with this patch it accepts 8, 16 and 32 bpp and returns FALSE for any other depth. */
pixman_fill_sse2 (uint32_t *bits,
int stride,
int bpp,
__m128i xmm_def;
- if (bpp == 16 && (data >> 16 != (data & 0xffff)))
- return FALSE;
+ if (bpp == 8) /* new 8-bpp path: byte addressing, fill byte replicated into all four bytes of data */
+ {
+ uint8_t b;
+ uint32_t w; /* unsigned 32-bit so that (w << 16) below is not a shift of a promoted signed int, which overflows when b >= 0x80 */
- if (bpp != 16 && bpp != 32)
- return FALSE;
+ stride = stride * (int) sizeof (uint32_t) / 1; /* scale by sizeof (uint32_t); the "/ 1" mirrors the "/ 2" and "/ 4" of the wider branches */
+ byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
+ byte_width = width;
+ stride *= 1;
- if (bpp == 16)
+ b = data & 0xff; /* only the low byte of data is used at 8 bpp */
+ w = (b << 8) | b;
+ data = (w << 16) | w;
+ }
+ else if (bpp == 16)
{
stride = stride * (int) sizeof (uint32_t) / 2;
byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
byte_width = 2 * width;
stride *= 2;
+
+ data = (data & 0xffff) * 0x00010001; /* copy the low 16 bits into both halves instead of rejecting mismatched halves */
}
- else
+ else if (bpp == 32)
{
stride = stride * (int) sizeof (uint32_t) / 4;
byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
byte_width = 4 * width;
stride *= 4;
}
+ else /* any depth other than 8, 16 or 32 is not handled here */
+ {
+ return FALSE;
+ }
- cache_prefetch ((__m128i*)byte_line);
xmm_def = create_mask_2x32_128 (data, data);
while (height--)
byte_line += stride;
w = byte_width;
-
- cache_prefetch_next ((__m128i*)d);
+ if (w >= 1 && ((unsigned long)d & 1)) /* write one leading byte when d is at an odd address */
+ {
+ *(uint8_t *)d = data;
+ w -= 1;
+ d += 1;
+ }
while (w >= 2 && ((unsigned long)d & 3))
{
d += 4;
}
- cache_prefetch_next ((__m128i*)d);
-
while (w >= 128)
{
- cache_prefetch (((__m128i*)d) + 12);
-
save_128_aligned ((__m128i*)(d), xmm_def);
save_128_aligned ((__m128i*)(d + 16), xmm_def);
save_128_aligned ((__m128i*)(d + 32), xmm_def);
if (w >= 64)
{
- cache_prefetch (((__m128i*)d) + 8);
-
save_128_aligned ((__m128i*)(d), xmm_def);
save_128_aligned ((__m128i*)(d + 16), xmm_def);
save_128_aligned ((__m128i*)(d + 32), xmm_def);
w -= 64;
}
- cache_prefetch_next ((__m128i*)d);
-
if (w >= 32)
{
save_128_aligned ((__m128i*)(d), xmm_def);
w -= 16;
}
- cache_prefetch_next ((__m128i*)d);
-
while (w >= 4)
{
*(uint32_t *)d = data;
w -= 2;
d += 2;
}
+
+ if (w >= 1) /* write the final byte left over after the wider stores */
+ {
+ *(uint8_t *)d = data;
+ w -= 1;
+ d += 1;
+ }
}
- _mm_empty ();
return TRUE;
}
static void
sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
+ pixman_composite_info_t *info)
{
+ PIXMAN_COMPOSITE_ARGS (info);
uint32_t src, srca;
uint32_t *dst_line, *dst;
uint8_t *mask_line, *mask;
int dst_stride, mask_stride;
- uint16_t w;
+ int32_t w;
uint32_t m;
__m128i xmm_src, xmm_def;
__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
srca = src >> 24;
if (src == 0)
{
- pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
- PIXMAN_FORMAT_BPP (dst_image->bits.format),
+ pixman_fill_sse2 (dest_image->bits.bits, dest_image->bits.rowstride,
+ PIXMAN_FORMAT_BPP (dest_image->bits.format),
dest_x, dest_y, width, height, 0);
return;
}
PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (
mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
mask_line += mask_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w && (unsigned long)dst & 15)
{
uint8_t m = *mask++;
if (m)
{
- *dst = pack_1x64_32 (
- pix_multiply_1x64 (
- _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
+ *dst = pack_1x128_32 (
+ pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
}
else
{
dst++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)mask);
- cache_prefetch_next ((__m128i*)dst);
-
m = *((uint32_t*)mask);
if (srca == 0xff && m == 0xffffffff)
if (m)
{
- *dst = pack_1x64_32 (
- pix_multiply_1x64 (
- _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
+ *dst = pack_1x128_32 (
+ pix_multiply_1x128 (
+ xmm_src, expand_pixel_8_1x128 (m)));
}
else
{
}
}
- _mm_empty ();
}
-/*-----------------------------------------------------------------------
- * composite_over_n_8_0565
- */
-
static void
sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
+ pixman_composite_info_t *info)
{
- uint32_t src, srca;
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t src;
uint16_t *dst_line, *dst, d;
uint8_t *mask_line, *mask;
int dst_stride, mask_stride;
- uint16_t w;
+ int32_t w;
uint32_t m;
- __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+ __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
__m128i xmm_src, xmm_alpha;
__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
__m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
- srca = src >> 24;
if (src == 0)
return;
PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+ dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (
mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
xmm_src = expand_pixel_32_1x128 (src);
xmm_alpha = expand_alpha_1x128 (xmm_src);
- mmx_src = _mm_movepi64_pi64 (xmm_src);
- mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+ mmx_src = xmm_src;
+ mmx_alpha = xmm_alpha;
while (height--)
{
mask_line += mask_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w && (unsigned long)dst & 15)
{
m = *mask++;
if (m)
{
d = *dst;
- mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
- mmx_dest = expand565_16_1x64 (d);
+ mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+ mmx_dest = expand565_16_1x128 (d);
*dst = pack_565_32_16 (
- pack_1x64_32 (
- in_over_1x64 (
+ pack_1x128_32 (
+ in_over_1x128 (
&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
}
dst++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 8)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)mask);
- cache_prefetch_next ((__m128i*)dst);
-
xmm_dst = load_128_aligned ((__m128i*) dst);
unpack_565_128_4x128 (xmm_dst,
&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
if (m)
{
d = *dst;
- mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
- mmx_dest = expand565_16_1x64 (d);
+ mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+ mmx_dest = expand565_16_1x128 (d);
*dst = pack_565_32_16 (
- pack_1x64_32 (
- in_over_1x64 (
+ pack_1x128_32 (
+ in_over_1x128 (
&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
}
}
}
- _mm_empty ();
}
-/* -----------------------------------------------------------------------
- * composite_over_pixbuf_0565
- */
-
static void
sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
+ pixman_composite_info_t *info)
{
+ PIXMAN_COMPOSITE_ARGS (info);
uint16_t *dst_line, *dst, d;
uint32_t *src_line, *src, s;
int dst_stride, src_stride;
- uint16_t w;
+ int32_t w;
uint32_t opaque, zero;
- __m64 ms;
+ __m128i ms;
__m128i xmm_src, xmm_src_lo, xmm_src_hi;
__m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+ dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (
src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-#if 0
- /* FIXME
- *
- * I copy the code from MMX one and keep the fixme.
- * If it's a problem there, probably is a problem here.
- */
- assert (src_image->drawable == mask_image->drawable);
-#endif
-
while (height--)
{
dst = dst_line;
src_line += src_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
while (w && (unsigned long)dst & 15)
{
s = *src++;
d = *dst;
- ms = unpack_32_1x64 (s);
+ ms = unpack_32_1x128 (s);
*dst++ = pack_565_32_16 (
- pack_1x64_32 (
- over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
+ pack_1x128_32 (
+ over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 8)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)src);
- cache_prefetch_next ((__m128i*)dst);
-
/* First round */
xmm_src = load_128_unaligned ((__m128i*)src);
xmm_dst = load_128_aligned ((__m128i*)dst);
s = *src++;
d = *dst;
- ms = unpack_32_1x64 (s);
+ ms = unpack_32_1x128 (s);
*dst++ = pack_565_32_16 (
- pack_1x64_32 (
- over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
+ pack_1x128_32 (
+ over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
w--;
}
}
- _mm_empty ();
}
-/* -------------------------------------------------------------------------
- * composite_over_pixbuf_8888
- */
-
static void
sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
+ pixman_composite_info_t *info)
{
+ PIXMAN_COMPOSITE_ARGS (info);
uint32_t *dst_line, *dst, d;
uint32_t *src_line, *src, s;
int dst_stride, src_stride;
- uint16_t w;
+ int32_t w;
uint32_t opaque, zero;
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_dst_lo, xmm_dst_hi;
PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (
src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-#if 0
- /* FIXME
- *
- * I copy the code from MMX one and keep the fixme.
- * If it's a problem there, probably is a problem here.
- */
- assert (src_image->drawable == mask_image->drawable);
-#endif
-
while (height--)
{
dst = dst_line;
src_line += src_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
while (w && (unsigned long)dst & 15)
{
s = *src++;
d = *dst;
- *dst++ = pack_1x64_32 (
- over_rev_non_pre_1x64 (
- unpack_32_1x64 (s), unpack_32_1x64 (d)));
+ *dst++ = pack_1x128_32 (
+ over_rev_non_pre_1x128 (
+ unpack_32_1x128 (s), unpack_32_1x128 (d)));
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)src);
- cache_prefetch_next ((__m128i*)dst);
-
xmm_src_hi = load_128_unaligned ((__m128i*)src);
opaque = is_opaque (xmm_src_hi);
s = *src++;
d = *dst;
- *dst++ = pack_1x64_32 (
- over_rev_non_pre_1x64 (
- unpack_32_1x64 (s), unpack_32_1x64 (d)));
+ *dst++ = pack_1x128_32 (
+ over_rev_non_pre_1x128 (
+ unpack_32_1x128 (s), unpack_32_1x128 (d)));
w--;
}
}
- _mm_empty ();
}
-/* -------------------------------------------------------------------------------------------------
- * composite_over_n_8888_0565_ca
- */
-
static void
sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
+ pixman_composite_info_t *info)
{
+ PIXMAN_COMPOSITE_ARGS (info);
uint32_t src;
uint16_t *dst_line, *dst, d;
uint32_t *mask_line, *mask, m;
__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
__m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
- __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+ __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
if (src == 0)
return;
PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+ dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (
mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
xmm_src = expand_pixel_32_1x128 (src);
xmm_alpha = expand_alpha_1x128 (xmm_src);
- mmx_src = _mm_movepi64_pi64 (xmm_src);
- mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+ mmx_src = xmm_src;
+ mmx_alpha = xmm_alpha;
while (height--)
{
mask_line += mask_stride;
dst_line += dst_stride;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w && ((unsigned long)dst & 15))
{
m = *(uint32_t *) mask;
if (m)
{
d = *dst;
- mmx_mask = unpack_32_1x64 (m);
- mmx_dest = expand565_16_1x64 (d);
+ mmx_mask = unpack_32_1x128 (m);
+ mmx_dest = expand565_16_1x128 (d);
*dst = pack_565_32_16 (
- pack_1x64_32 (
- in_over_1x64 (
+ pack_1x128_32 (
+ in_over_1x128 (
&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
}
mask++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 8)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)mask);
- cache_prefetch_next ((__m128i*)dst);
-
/* First round */
xmm_mask = load_128_unaligned ((__m128i*)mask);
xmm_dst = load_128_aligned ((__m128i*)dst);
if (m)
{
d = *dst;
- mmx_mask = unpack_32_1x64 (m);
- mmx_dest = expand565_16_1x64 (d);
+ mmx_mask = unpack_32_1x128 (m);
+ mmx_dest = expand565_16_1x128 (d);
*dst = pack_565_32_16 (
- pack_1x64_32 (
- in_over_1x64 (
+ pack_1x128_32 (
+ in_over_1x128 (
&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
}
}
}
- _mm_empty ();
}
-/* -----------------------------------------------------------------------
- * composite_in_n_8_8
- */
-
static void
sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
+ pixman_composite_info_t *info)
{
+ PIXMAN_COMPOSITE_ARGS (info);
uint8_t *dst_line, *dst;
uint8_t *mask_line, *mask;
int dst_stride, mask_stride;
- uint16_t w, d, m;
+ uint32_t d, m;
uint32_t src;
- uint8_t sa;
+ int32_t w;
__m128i xmm_alpha;
__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+ dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (
mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- sa = src >> 24;
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
mask_line += mask_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w && ((unsigned long)dst & 15))
{
m = (uint32_t) *mask++;
d = (uint32_t) *dst;
- *dst++ = (uint8_t) pack_1x64_32 (
- pix_multiply_1x64 (
- pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
- unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
+ *dst++ = (uint8_t) pack_1x128_32 (
+ pix_multiply_1x128 (
+ pix_multiply_1x128 (xmm_alpha,
+ unpack_32_1x128 (m)),
+ unpack_32_1x128 (d)));
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 16)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)mask);
- cache_prefetch_next ((__m128i*)dst);
-
xmm_mask = load_128_unaligned ((__m128i*)mask);
xmm_dst = load_128_aligned ((__m128i*)dst);
m = (uint32_t) *mask++;
d = (uint32_t) *dst;
- *dst++ = (uint8_t) pack_1x64_32 (
- pix_multiply_1x64 (
- pix_multiply_1x64 (
- _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
+ *dst++ = (uint8_t) pack_1x128_32 (
+ pix_multiply_1x128 (
+ pix_multiply_1x128 (
+ xmm_alpha, unpack_32_1x128 (m)),
+ unpack_32_1x128 (d)));
w--;
}
}
- _mm_empty ();
}
-/* ---------------------------------------------------------------------------
- * composite_in_8_8
- */
+static void /* Solid-source fast path on 8-bit destination pixels: each destination byte is combined with the top byte of the solid source via pix_multiply. */
+sse2_composite_in_n_8 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info); /* presumably declares src_image, dest_image, dest_x, dest_y, width and height used below -- confirm against the macro */
+ uint8_t *dst_line, *dst;
+ int dst_stride;
+ uint32_t d;
+ uint32_t src;
+ int32_t w;
+
+ __m128i xmm_alpha;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+ xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); /* built from the full 32-bit value before src is reduced to its top byte below */
+
+ src = src >> 24; /* from here on src holds only the top 8 bits of the solid value */
+
+ if (src == 0xff) /* nothing is written in this case; presumably a multiply by 0xff leaves the destination unchanged */
+ return;
+
+ if (src == 0x00) /* src is 0 here, so the 8-bpp fill below writes zero bytes over the rectangle */
+ {
+ pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
+ 8, dest_x, dest_y, width, height, src);
+
+ return;
+ }
+
+ while (height--) /* one iteration per row */
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ w = width;
+
+ while (w && ((unsigned long)dst & 15)) /* one byte at a time until dst is 16-byte aligned */
+ {
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x128_32 (
+ pix_multiply_1x128 (
+ xmm_alpha,
+ unpack_32_1x128 (d)));
+ w--;
+ }
+
+ while (w >= 16) /* 16 destination bytes per iteration, using aligned load and store */
+ {
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); /* splits the vector into lo and hi halves for the multiply */
+
+ pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
+ &xmm_dst_lo, &xmm_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ dst += 16;
+ w -= 16;
+ }
+
+ while (w) /* remaining 0-15 bytes, one at a time */
+ {
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x128_32 (
+ pix_multiply_1x128 (
+ xmm_alpha,
+ unpack_32_1x128 (d)));
+ w--;
+ }
+ }
+
+}
static void
sse2_composite_in_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
+ pixman_composite_info_t *info)
{
+ PIXMAN_COMPOSITE_ARGS (info);
uint8_t *dst_line, *dst;
uint8_t *src_line, *src;
int src_stride, dst_stride;
- uint16_t w;
+ int32_t w;
uint32_t s, d;
__m128i xmm_src, xmm_src_lo, xmm_src_hi;
__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+ dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (
src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
src_line += src_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
while (w && ((unsigned long)dst & 15))
{
s = (uint32_t) *src++;
d = (uint32_t) *dst;
- *dst++ = (uint8_t) pack_1x64_32 (
- pix_multiply_1x64 (
- unpack_32_1x64 (s), unpack_32_1x64 (d)));
+ *dst++ = (uint8_t) pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (s), unpack_32_1x128 (d)));
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 16)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)src);
- cache_prefetch_next ((__m128i*)dst);
-
xmm_src = load_128_unaligned ((__m128i*)src);
xmm_dst = load_128_aligned ((__m128i*)dst);
s = (uint32_t) *src++;
d = (uint32_t) *dst;
- *dst++ = (uint8_t) pack_1x64_32 (
- pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
+ *dst++ = (uint8_t) pack_1x128_32 (
+ pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
w--;
}
}
- _mm_empty ();
}
-/* -------------------------------------------------------------------------
- * composite_add_n_8_8
- */
-
static void
sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
+ pixman_composite_info_t *info)
{
+ PIXMAN_COMPOSITE_ARGS (info);
uint8_t *dst_line, *dst;
uint8_t *mask_line, *mask;
int dst_stride, mask_stride;
- uint16_t w;
+ int32_t w;
uint32_t src;
- uint8_t sa;
uint32_t m, d;
__m128i xmm_alpha;
__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+ dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (
mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
- src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
- sa = src >> 24;
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
mask_line += mask_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w && ((unsigned long)dst & 15))
{
m = (uint32_t) *mask++;
d = (uint32_t) *dst;
- *dst++ = (uint8_t) pack_1x64_32 (
- _mm_adds_pu16 (
- pix_multiply_1x64 (
- _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
+ *dst++ = (uint8_t) pack_1x128_32 (
+ _mm_adds_epu16 (
+ pix_multiply_1x128 (
+ xmm_alpha, unpack_32_1x128 (m)),
+ unpack_32_1x128 (d)));
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 16)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)mask);
- cache_prefetch_next ((__m128i*)dst);
-
xmm_mask = load_128_unaligned ((__m128i*)mask);
xmm_dst = load_128_aligned ((__m128i*)dst);
m = (uint32_t) *mask++;
d = (uint32_t) *dst;
- *dst++ = (uint8_t) pack_1x64_32 (
- _mm_adds_pu16 (
- pix_multiply_1x64 (
- _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
+ *dst++ = (uint8_t) pack_1x128_32 (
+ _mm_adds_epu16 (
+ pix_multiply_1x128 (
+ xmm_alpha, unpack_32_1x128 (m)),
+ unpack_32_1x128 (d)));
w--;
}
}
- _mm_empty ();
}
-/* ----------------------------------------------------------------------
- * composite_add_8000_8000
- */
-
 static void
-sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
+sse2_composite_add_n_8 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
 {
+ PIXMAN_COMPOSITE_ARGS (info); /* presumably declares src_image, dest_image, dest_x, dest_y, width and height, which are used below without a local declaration */
 uint8_t *dst_line, *dst;
- uint8_t *src_line, *src;
- int dst_stride, src_stride;
- uint16_t w;
- uint16_t t;
+ int dst_stride;
+ int32_t w;
+ uint32_t src;
+
+ __m128i xmm_src;
 PIXMAN_IMAGE_GET_LINE (
- src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
- PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+ dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
- while (height--)
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+ src >>= 24; /* keep only the top byte (bits 24-31) of the solid value */
+
+ if (src == 0x00) /* adding zero changes nothing: return without touching the destination */
+ return;
+
+ if (src == 0xff) /* a saturating add of 0xff always yields 0xff: fill the rectangle with 0xff at 8 bpp instead of looping */
 {
- dst = dst_line;
- src = src_line;
+ pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
+ 8, dest_x, dest_y, width, height, 0xff);
+
+ return;
+ }
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
+ src = (src << 24) | (src << 16) | (src << 8) | src; /* replicate the byte into all four bytes of the 32-bit word */
+ xmm_src = _mm_set_epi32 (src, src, src, src); /* ...and from there into all 16 byte lanes of the vector */
+ while (height--) /* one destination row per iteration */
+ {
+ dst = dst_line;
 dst_line += dst_stride;
- src_line += src_stride;
 w = width;
- /* Small head */
- while (w && (unsigned long)dst & 3)
+ while (w && ((unsigned long)dst & 15)) /* head: single bytes until dst is 16-byte aligned */
 {
- t = (*dst) + (*src++);
- *dst++ = t | (0 - (t >> 8));
+ *dst = (uint8_t)_mm_cvtsi128_si32 (
+ _mm_adds_epu8 (
+ xmm_src,
+ _mm_cvtsi32_si128 (*dst))); /* saturating byte add; only the lowest byte of the result is stored */
+
 w--;
+ dst++;
 }
- core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
-
- /* Small tail */
- dst += w & 0xfffc;
- src += w & 0xfffc;
+ while (w >= 16) /* body: 16 destination bytes per iteration, aligned load and aligned store */
+ {
+ save_128_aligned (
+ (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
- w &= 3;
+ dst += 16;
+ w -= 16;
+ }
 while (w)
 {
- t = (*dst) + (*src++);
- *dst++ = t | (0 - (t >> 8));
+ *dst = (uint8_t)_mm_cvtsi128_si32 ( /* tail: same single-byte saturating add as the head loop */
+ _mm_adds_epu8 (
+ xmm_src,
+ _mm_cvtsi32_si128 (*dst)));
+
+ w--;
+ dst++;
+ }
+ }
+
+}
+
+static void
+sse2_composite_add_8_8 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{ /* Saturating add of an 8-bit source image onto an 8-bit destination image, row by row. */
+ PIXMAN_COMPOSITE_ARGS (info); /* presumably declares op, src_image, dest_image, src_x/src_y, dest_x/dest_y, width and height, which are used below without a local declaration */
+ uint8_t *dst_line, *dst;
+ uint8_t *src_line, *src;
+ int dst_stride, src_stride;
+ int32_t w;
+ uint16_t t;
+
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ src = src_line;
+
+ dst_line += dst_stride;
+ src_line += src_stride;
+ w = width;
+
+ /* Small head: single bytes until dst is 4-byte aligned */
+ while (w && (unsigned long)dst & 3)
+ {
+ t = (*dst) + (*src++);
+ *dst++ = t | (0 - (t >> 8)); /* a carry into bit 8 turns the stored byte into 0xff (saturation) */
+ w--;
+ }
+
+ sse2_combine_add_u (imp, op,
+ (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2); /* the bulk is handed over as (w >> 2) 32-bit words */
+
+ /* Small tail: skip the 4 * (w >> 2) bytes handed over above; w is 32-bit here, so the old 16-bit mask 0xfffc would drop bits 16 and up */
+ dst += w & ~3;
+ src += w & ~3;
+
+ w &= 3;
+
+ while (w)
+ {
+ t = (*dst) + (*src++);
+ *dst++ = t | (0 - (t >> 8)); /* same saturating byte add as in the head loop */
 w--;
 }
 }
- _mm_empty ();
 }
-/* ---------------------------------------------------------------------
- * composite_add_8888_8888
- */
static void
sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
+ pixman_composite_info_t *info)
{
+ PIXMAN_COMPOSITE_ARGS (info);
uint32_t *dst_line, *dst;
uint32_t *src_line, *src;
int dst_stride, src_stride;
PIXMAN_IMAGE_GET_LINE (
src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
while (height--)
{
src = src_line;
src_line += src_stride;
- core_combine_add_u_sse2 (dst, src, NULL, width);
+ sse2_combine_add_u (imp, op, dst, src, NULL, width);
}
- _mm_empty ();
}
-/* -------------------------------------------------------------------------------------------------
- * sse2_composite_copy_area
- */
-
static pixman_bool_t
pixman_blt_sse2 (uint32_t *src_bits,
uint32_t *dst_bits,
int dst_bpp,
int src_x,
int src_y,
- int dst_x,
- int dst_y,
+ int dest_x,
+ int dest_y,
int width,
int height)
{
src_stride = src_stride * (int) sizeof (uint32_t) / 2;
dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
- dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
+ dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
byte_width = 2 * width;
src_stride *= 2;
dst_stride *= 2;
src_stride = src_stride * (int) sizeof (uint32_t) / 4;
dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
- dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
+ dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
byte_width = 4 * width;
src_stride *= 4;
dst_stride *= 4;
return FALSE;
}
- cache_prefetch ((__m128i*)src_bytes);
- cache_prefetch ((__m128i*)dst_bytes);
-
while (height--)
{
int w;
dst_bytes += dst_stride;
w = byte_width;
- cache_prefetch_next ((__m128i*)s);
- cache_prefetch_next ((__m128i*)d);
-
while (w >= 2 && ((unsigned long)d & 3))
{
*(uint16_t *)d = *(uint16_t *)s;
d += 4;
}
- cache_prefetch_next ((__m128i*)s);
- cache_prefetch_next ((__m128i*)d);
-
while (w >= 64)
{
__m128i xmm0, xmm1, xmm2, xmm3;
- /* 128 bytes ahead */
- cache_prefetch (((__m128i*)s) + 8);
- cache_prefetch (((__m128i*)d) + 8);
-
xmm0 = load_128_unaligned ((__m128i*)(s));
xmm1 = load_128_unaligned ((__m128i*)(s + 16));
xmm2 = load_128_unaligned ((__m128i*)(s + 32));
w -= 64;
}
- cache_prefetch_next ((__m128i*)s);
- cache_prefetch_next ((__m128i*)d);
-
while (w >= 16)
{
save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
s += 16;
}
- cache_prefetch_next ((__m128i*)s);
- cache_prefetch_next ((__m128i*)d);
-
while (w >= 4)
{
*(uint32_t *)d = *(uint32_t *)s;
}
}
- _mm_empty ();
return TRUE;
}
 static void
 sse2_composite_copy_area (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
+ pixman_composite_info_t *info)
 {
+ PIXMAN_COMPOSITE_ARGS (info); /* presumably declares src_image, dest_image, src_x/src_y, dest_x/dest_y, width and height, which are forwarded to pixman_blt_sse2 below */
 pixman_blt_sse2 (src_image->bits.bits,
- dst_image->bits.bits,
+ dest_image->bits.bits, /* NOTE(review): the pixman_bool_t result of pixman_blt_sse2 is not checked here */
 src_image->bits.rowstride,
- dst_image->bits.rowstride,
+ dest_image->bits.rowstride,
 PIXMAN_FORMAT_BPP (src_image->bits.format),
- PIXMAN_FORMAT_BPP (dst_image->bits.format),
+ PIXMAN_FORMAT_BPP (dest_image->bits.format),
 src_x, src_y, dest_x, dest_y, width, height);
 }
static void
sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
+ pixman_composite_info_t *info)
{
+ PIXMAN_COMPOSITE_ARGS (info);
uint32_t *src, *src_line, s;
uint32_t *dst, *dst_line, d;
uint8_t *mask, *mask_line;
uint32_t m;
int src_stride, mask_stride, dst_stride;
- uint16_t w;
- __m64 ms;
+ int32_t w;
+ __m128i ms;
__m128i xmm_src, xmm_src_lo, xmm_src_hi;
__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (
mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
PIXMAN_IMAGE_GET_LINE (
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
- cache_prefetch ((__m128i*)mask);
-
while (w && (unsigned long)dst & 15)
{
s = 0xff000000 | *src++;
m = (uint32_t) *mask++;
d = *dst;
- ms = unpack_32_1x64 (s);
+ ms = unpack_32_1x128 (s);
if (m != 0xff)
{
- __m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
- __m64 md = unpack_32_1x64 (d);
+ __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+ __m128i md = unpack_32_1x128 (d);
- ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
+ ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
}
- *dst++ = pack_1x64_32 (ms);
+ *dst++ = pack_1x128_32 (ms);
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
- cache_prefetch ((__m128i*)mask);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)src);
- cache_prefetch_next ((__m128i*)dst);
- cache_prefetch_next ((__m128i*)mask);
-
m = *(uint32_t*) mask;
- xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
+ xmm_src = _mm_or_si128 (
+ load_128_unaligned ((__m128i*)src), mask_ff000000);
if (m == 0xffffffff)
{
unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
- expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+ expand_alpha_rev_2x128 (
+ xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
- in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
}
}
else
{
- __m64 ma, md, ms;
+ __m128i ma, md, ms;
d = *dst;
- ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
- md = unpack_32_1x64 (d);
- ms = unpack_32_1x64 (s);
+ ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+ md = unpack_32_1x128 (d);
+ ms = unpack_32_1x128 (s);
- *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
+ *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
}
}
}
}
- _mm_empty ();
}
static void
sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src_image,
- pixman_image_t * mask_image,
- pixman_image_t * dst_image,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
+ pixman_composite_info_t *info)
{
+ PIXMAN_COMPOSITE_ARGS (info);
uint32_t *src, *src_line, s;
uint32_t *dst, *dst_line, d;
uint8_t *mask, *mask_line;
uint32_t m;
int src_stride, mask_stride, dst_stride;
- uint16_t w;
+ int32_t w;
__m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
PIXMAN_IMAGE_GET_LINE (
- dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
PIXMAN_IMAGE_GET_LINE (
mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
PIXMAN_IMAGE_GET_LINE (
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i *)src);
- cache_prefetch ((__m128i *)dst);
- cache_prefetch ((__m128i *)mask);
-
while (w && (unsigned long)dst & 15)
{
uint32_t sa;
}
else
{
- __m64 ms, md, ma, msa;
+ __m128i ms, md, ma, msa;
- ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
- ms = unpack_32_1x64 (s);
- md = unpack_32_1x64 (d);
+ ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+ ms = unpack_32_1x128 (s);
+ md = unpack_32_1x128 (d);
- msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
+ msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
- *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
+ *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
}
}
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i *)src);
- cache_prefetch ((__m128i *)dst);
- cache_prefetch ((__m128i *)mask);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i *)src);
- cache_prefetch_next ((__m128i *)dst);
- cache_prefetch_next ((__m128i *)mask);
-
m = *(uint32_t *) mask;
if (m)
expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
-
+
in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
&xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
}
else
{
- __m64 ms, md, ma, msa;
+ __m128i ms, md, ma, msa;
- ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
- ms = unpack_32_1x64 (s);
- md = unpack_32_1x64 (d);
+ ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+ ms = unpack_32_1x128 (s);
+ md = unpack_32_1x128 (d);
- msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
+ msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
- *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
+ *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
}
}
}
}
- _mm_empty ();
}
-static const pixman_fast_path_t sse2_fast_paths[] =
+static void
+sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
 {
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, sse2_composite_over_n_8_0565, 0 },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, sse2_composite_over_n_8_0565, 0 },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_over_n_0565, 0 },
- { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_over_8888_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_over_8888_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_over_8888_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_over_8888_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_over_8888_0565, 0 },
- { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, sse2_composite_over_8888_0565, 0 },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_8888_8_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_8888_8_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_8888_8_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_8888_8_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_x888_8_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_x888_8_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_solid, PIXMAN_a8r8g8b8, sse2_composite_over_x888_n_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_solid, PIXMAN_x8r8g8b8, sse2_composite_over_x888_n_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_solid, PIXMAN_a8b8g8r8, sse2_composite_over_x888_n_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_solid, PIXMAN_x8b8g8r8, sse2_composite_over_x888_n_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_solid, PIXMAN_a8r8g8b8, sse2_composite_over_8888_n_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_solid, PIXMAN_x8r8g8b8, sse2_composite_over_8888_n_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_solid, PIXMAN_a8b8g8r8, sse2_composite_over_8888_n_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_solid, PIXMAN_x8b8g8r8, sse2_composite_over_8888_n_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
- { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
- { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
- { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
- { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
- { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
- { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
- { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
- { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF },
- { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
- { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
- { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
- { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF },
- { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 },
- { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 },
-
- { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_add_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
- { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_composite_add_8000_8000, 0 },
- { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_add_8888_8888, 0 },
- { PIXMAN_OP_ADD, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_add_8888_8888, 0 },
- { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_composite_add_n_8_8, 0 },
-
- { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_src_n_8_8888, 0 },
- { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_src_n_8_8888, 0 },
- { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_src_n_8_8888, 0 },
- { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_src_n_8_8888, 0 },
- { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_copy_area, 0 },
- { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_copy_area, 0 },
- { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 },
- { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 },
- { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 },
- { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 },
- { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_copy_area, 0 },
- { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, sse2_composite_copy_area, 0 },
-
- { PIXMAN_OP_IN, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_composite_in_8_8, 0 },
- { PIXMAN_OP_IN, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_composite_in_n_8_8, 0 },
+ PIXMAN_COMPOSITE_ARGS (info); /* presumably declares src_image, dest_image, dest_x, dest_y, width and height, which are used below without a local declaration */
+ uint32_t src;
+ uint32_t *dst_line, *dst;
+ __m128i xmm_src;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_dsta_hi, xmm_dsta_lo;
+ int dst_stride;
+ int32_t w;
- { PIXMAN_OP_NONE },
-};
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); /* the solid value combined with every destination pixel below */
-/*
- * Work around GCC bug causing crashes in Mozilla with SSE2
- *
- * When using -msse, gcc generates movdqa instructions assuming that
- * the stack is 16 byte aligned. Unfortunately some applications, such
- * as Mozilla and Mono, end up aligning the stack to 4 bytes, which
- * causes the movdqa instructions to fail.
- *
- * The __force_align_arg_pointer__ makes gcc generate a prologue that
- * realigns the stack pointer to 16 bytes.
- *
- * On x86-64 this is not necessary because the standard ABI already
- * calls for a 16 byte aligned stack.
- *
- * See https://bugs.freedesktop.org/show_bug.cgi?id=15693
- */
-#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
-__attribute__((__force_align_arg_pointer__))
-#endif
-static void
-sse2_composite (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * src,
- pixman_image_t * mask,
- pixman_image_t * dest,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- if (_pixman_run_fast_path (sse2_fast_paths, imp,
- op, src, mask, dest,
- src_x, src_y,
- mask_x, mask_y,
- dest_x, dest_y,
- width, height))
- {
+ if (src == 0) /* an all-zero solid value: return before touching the destination */
 return;
- }
- _pixman_implementation_composite (imp->delegate, op,
- src, mask, dest,
- src_x, src_y,
- mask_x, mask_y,
- dest_x, dest_y,
- width, height);
-}
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
-__attribute__((__force_align_arg_pointer__))
-#endif
-static pixman_bool_t
-sse2_blt (pixman_implementation_t *imp,
- uint32_t * src_bits,
- uint32_t * dst_bits,
- int src_stride,
- int dst_stride,
- int src_bpp,
- int dst_bpp,
- int src_x,
- int src_y,
- int dst_x,
- int dst_y,
- int width,
- int height)
-{
- if (!pixman_blt_sse2 (
- src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
- src_x, src_y, dst_x, dst_y, width, height))
+ xmm_src = expand_pixel_32_1x128 (src); /* computed once and reused for every pixel of every row */
+ while (height--) /* one destination row per iteration */
 {
- return _pixman_implementation_blt (
- imp->delegate,
- src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
- src_x, src_y, dst_x, dst_y, width, height);
- }
+ dst = dst_line;
- return TRUE;
-}
+ dst_line += dst_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15) /* head: one pixel at a time until dst is 16-byte aligned */
+ {
+ __m128i vd;
+
+ vd = unpack_32_1x128 (*dst);
+
+ *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
+ xmm_src)); /* the destination pixel and its expanded alpha come first, the solid last - presumably the reversed operand order the function name refers to */
+ w--;
+ dst++;
+ }
+
+ while (w >= 4) /* body: four 32-bit pixels per iteration, aligned load and aligned store */
+ {
+ __m128i tmp_lo, tmp_hi;
+
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
+
+ tmp_lo = xmm_src; /* working copies: they are passed by pointer below and packed as the result, so xmm_src itself stays unchanged for the next iteration */
+ tmp_hi = xmm_src;
+
+ over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_dsta_lo, &xmm_dsta_hi,
+ &tmp_lo, &tmp_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
+
+ w -= 4;
+ dst += 4;
+ }
+
+ while (w) /* tail: the remaining (fewer than four) pixels, one at a time */
+ {
+ __m128i vd;
+
+ vd = unpack_32_1x128 (*dst);
+
+ *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
+ xmm_src));
+ w--;
+ dst++;
+ }
-#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
-__attribute__((__force_align_arg_pointer__))
-#endif
-static pixman_bool_t
-sse2_fill (pixman_implementation_t *imp,
- uint32_t * bits,
- int stride,
- int bpp,
- int x,
- int y,
- int width,
- int height,
- uint32_t xor)
-{
- if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
- {
- return _pixman_implementation_fill (
- imp->delegate, bits, stride, bpp, x, y, width, height, xor);
 }
- return TRUE;
 }
-#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
-__attribute__((__force_align_arg_pointer__))
-#endif
-pixman_implementation_t *
-_pixman_implementation_create_sse2 (void)
+static void
+sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
{
- pixman_implementation_t *mmx = _pixman_implementation_create_mmx ();
- pixman_implementation_t *imp = _pixman_implementation_create (mmx);
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t *src, *src_line, s;
+ uint32_t *dst, *dst_line, d;
+ uint32_t *mask, *mask_line;
+ uint32_t m;
+ int src_stride, mask_stride, dst_stride;
+ int32_t w;
- /* SSE2 constants */
- mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
- mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
- mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
- mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
- mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
- mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
- mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
- mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
- mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
- mask_0080 = create_mask_16_128 (0x0080);
- mask_00ff = create_mask_16_128 (0x00ff);
- mask_0101 = create_mask_16_128 (0x0101);
- mask_ffff = create_mask_16_128 (0xffff);
- mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
- mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
- /* MMX constants */
- mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
- mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
- mask_x0080 = create_mask_16_64 (0x0080);
- mask_x00ff = create_mask_16_64 (0x00ff);
- mask_x0101 = create_mask_16_64 (0x0101);
- mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);
+ while (height--)
+ {
+ src = src_line;
+ src_line += src_stride;
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
- _mm_empty ();
+ w = width;
- /* Set up function pointers */
+ while (w && (unsigned long)dst & 15)
+ {
+ uint32_t sa;
- /* SSE code patch for fbcompose.c */
- imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
- imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
- imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
- imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
- imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
- imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
- imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
- imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
- imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
- imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
+ s = *src++;
+ m = (*mask++) >> 24;
+ d = *dst;
- imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
+ sa = s >> 24;
- imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
- imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
- imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
- imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
- imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
- imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
- imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
- imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
- imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
- imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
- imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
+ if (m)
+ {
+ if (sa == 0xff && m == 0xff)
+ {
+ *dst = s;
+ }
+ else
+ {
+ __m128i ms, md, ma, msa;
- imp->composite = sse2_composite;
- imp->blt = sse2_blt;
- imp->fill = sse2_fill;
+ ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+ ms = unpack_32_1x128 (s);
+ md = unpack_32_1x128 (d);
- return imp;
-}
+ msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+ *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+ }
+ }
-#endif /* USE_SSE2 */
+ dst++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_mask = load_128_unaligned ((__m128i*)mask);
+
+ if (!is_transparent (xmm_mask))
+ {
+ xmm_src = load_128_unaligned ((__m128i*)src);
+
+ if (is_opaque (xmm_mask) && is_opaque (xmm_src))
+ {
+ save_128_aligned ((__m128i *)dst, xmm_src);
+ }
+ else
+ {
+ xmm_dst = load_128_aligned ((__m128i *)dst);
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
+ expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
+ &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+ }
+
+ src += 4;
+ dst += 4;
+ mask += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ uint32_t sa;
+
+ s = *src++;
+ m = (*mask++) >> 24;
+ d = *dst;
+
+ sa = s >> 24;
+
+ if (m)
+ {
+ if (sa == 0xff && m == 0xff)
+ {
+ *dst = s;
+ }
+ else
+ {
+ __m128i ms, md, ma, msa;
+
+ ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+ ms = unpack_32_1x128 (s);
+ md = unpack_32_1x128 (d);
+
+ msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+ *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+ }
+ }
+
+ dst++;
+ w--;
+ }
+ }
+
+}
+
+/*
+ * A variant of 'sse2_combine_over_u' with minor tweaks: composites one
+ * scanline of nearest-neighbour scaled source pixels OVER the destination.
+ *
+ * pd      - destination scanline, read and written in place
+ * ps      - source scanline; each pixel is read from ps[vx >> 16]
+ * w       - number of destination pixels to produce
+ * vx      - fixed-point source position of the first pixel
+ * unit_x  - fixed-point step added to vx after every pixel
+ * max_vx  - not referenced by this function
+ * fully_transparent_src - when set, the function returns immediately
+ *           without touching the destination
+ */
+static force_inline void
+scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
+ const uint32_t* ps,
+ int32_t w,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t max_vx,
+ pixman_bool_t fully_transparent_src)
+{
+ uint32_t s, d;
+ /* pm is never assigned a non-NULL value, so combine1/combine4 below are
+  * always handed a NULL mask pointer and the 'if (pm)' updates never run */
+ const uint32_t* pm = NULL;
+
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+ if (fully_transparent_src)
+ return;
+
+ /* Align dst on a 16-byte boundary */
+ while (w && ((unsigned long)pd & 15))
+ {
+ d = *pd;
+ s = combine1 (ps + (vx >> 16), pm);
+ vx += unit_x;
+
+ *pd++ = core_combine_over_u_pixel_sse2 (s, d);
+ if (pm)
+ pm++;
+ w--;
+ }
+
+ /* Main loop: four destination pixels per iteration, aligned stores */
+ while (w >= 4)
+ {
+ __m128i tmp;
+ uint32_t tmp1, tmp2, tmp3, tmp4;
+
+ /* gather four source pixels at successive vx positions */
+ tmp1 = ps[vx >> 16];
+ vx += unit_x;
+ tmp2 = ps[vx >> 16];
+ vx += unit_x;
+ tmp3 = ps[vx >> 16];
+ vx += unit_x;
+ tmp4 = ps[vx >> 16];
+ vx += unit_x;
+
+ /* tmp1 ends up in the lowest 32-bit lane, tmp4 in the highest */
+ tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
+
+ xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
+
+ /* when is_opaque() holds the source is stored as-is; when is_zero()
+  * holds the destination is left untouched; otherwise blend */
+ if (is_opaque (xmm_src_hi))
+ {
+ save_128_aligned ((__m128i*)pd, xmm_src_hi);
+ }
+ else if (!is_zero (xmm_src_hi))
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_2x128 (
+ xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+
+ over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ /* rebuild the 4 pixel data and save */
+ save_128_aligned ((__m128i*)pd,
+ pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ w -= 4;
+ pd += 4;
+ if (pm)
+ pm += 4;
+ }
+
+ /* Tail: fewer than four pixels remain, handle them one at a time */
+ while (w)
+ {
+ d = *pd;
+ s = combine1 (ps + (vx >> 16), pm);
+ vx += unit_x;
+
+ *pd++ = core_combine_over_u_pixel_sse2 (s, d);
+ if (pm)
+ pm++;
+
+ w--;
+ }
+}
+
+/*
+ * Instantiate FAST_NEAREST_MAINLOOP around
+ * scaled_nearest_scanline_sse2_8888_8888_OVER, once each for the COVER,
+ * NONE and PAD variants.  NOTE(review): the macro is not defined in this
+ * part of the file (presumably pixman-inlines.h) -- confirm what each
+ * variant generates.
+ */
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
+ scaled_nearest_scanline_sse2_8888_8888_OVER,
+ uint32_t, uint32_t, COVER)
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
+ scaled_nearest_scanline_sse2_8888_8888_OVER,
+ uint32_t, uint32_t, NONE)
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
+ scaled_nearest_scanline_sse2_8888_8888_OVER,
+ uint32_t, uint32_t, PAD)
+
+/*
+ * Nearest-neighbour scaled OVER of a 32-bit source scanline onto 'dst',
+ * modulated by a solid mask.
+ *
+ * mask     - points at the solid mask pixel; only its top byte
+ *            (*mask >> 24) is used
+ * dst      - destination scanline, read and written in place
+ * src      - source scanline; each pixel is read from
+ *            src[pixman_fixed_to_int (vx)]
+ * w        - number of destination pixels
+ * vx       - fixed-point source position of the first pixel
+ * unit_x   - fixed-point step added to vx after every pixel
+ * max_vx   - not referenced by this function
+ * zero_src - when set, the function returns without writing anything
+ *
+ * The function also returns early when the mask's top byte is zero.
+ * Source pixels (or groups of four) that are entirely zero leave the
+ * destination untouched.
+ */
+static force_inline void
+scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
+                                               uint32_t *       dst,
+                                               const uint32_t * src,
+                                               int32_t          w,
+                                               pixman_fixed_t   vx,
+                                               pixman_fixed_t   unit_x,
+                                               pixman_fixed_t   max_vx,
+                                               pixman_bool_t    zero_src)
+{
+    __m128i xmm_mask;
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+    if (zero_src || (*mask >> 24) == 0)
+        return;
+
+    /* replicate the mask's top byte into every 16-bit lane */
+    xmm_mask = create_mask_16_128 (*mask >> 24);
+
+    /* head: single pixels until dst reaches a 16-byte boundary */
+    while (w && (unsigned long)dst & 15)
+    {
+        uint32_t s = src[pixman_fixed_to_int (vx)];
+        vx += unit_x;
+
+        if (s)
+        {
+            uint32_t d = *dst;
+
+            /* locals are named after the in_over_1x128 argument they
+             * feed: source, source alpha, mask, destination */
+            __m128i ms = unpack_32_1x128 (s);
+            __m128i alpha = expand_alpha_1x128 (ms);
+            __m128i msk = xmm_mask;
+            __m128i dest = unpack_32_1x128 (d);
+
+            *dst = pack_1x128_32 (
+                in_over_1x128 (&ms, &alpha, &msk, &dest));
+        }
+        dst++;
+        w--;
+    }
+
+    /* main loop: four pixels per iteration with aligned load/store */
+    while (w >= 4)
+    {
+        uint32_t tmp1, tmp2, tmp3, tmp4;
+
+        tmp1 = src[pixman_fixed_to_int (vx)];
+        vx += unit_x;
+        tmp2 = src[pixman_fixed_to_int (vx)];
+        vx += unit_x;
+        tmp3 = src[pixman_fixed_to_int (vx)];
+        vx += unit_x;
+        tmp4 = src[pixman_fixed_to_int (vx)];
+        vx += unit_x;
+
+        /* tmp1 lands in the lowest 32-bit lane, tmp4 in the highest */
+        xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
+
+        if (!is_zero (xmm_src))
+        {
+            xmm_dst = load_128_aligned ((__m128i*)dst);
+
+            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+            expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+                                &xmm_alpha_lo, &xmm_alpha_hi);
+
+            /* the same mask vector serves both the low and high halves */
+            in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+                           &xmm_alpha_lo, &xmm_alpha_hi,
+                           &xmm_mask, &xmm_mask,
+                           &xmm_dst_lo, &xmm_dst_hi);
+
+            save_128_aligned (
+                (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+        }
+
+        dst += 4;
+        w -= 4;
+    }
+
+    /* tail: fewer than four pixels remain */
+    while (w)
+    {
+        uint32_t s = src[pixman_fixed_to_int (vx)];
+        vx += unit_x;
+
+        if (s)
+        {
+            uint32_t d = *dst;
+
+            /* 'msk' rather than 'mask' so the parameter is not shadowed */
+            __m128i ms = unpack_32_1x128 (s);
+            __m128i alpha = expand_alpha_1x128 (ms);
+            __m128i msk = xmm_mask;
+            __m128i dest = unpack_32_1x128 (d);
+
+            *dst = pack_1x128_32 (
+                in_over_1x128 (&ms, &alpha, &msk, &dest));
+        }
+
+        dst++;
+        w--;
+    }
+
+}
+
+/*
+ * Instantiate FAST_NEAREST_MAINLOOP_COMMON around
+ * scaled_nearest_scanline_sse2_8888_n_8888_OVER, once each for COVER, PAD
+ * and NONE.  NOTE(review): the meaning of the two trailing TRUE arguments
+ * is set by the macro definition, which is not visible here -- confirm.
+ */
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
+ scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+ uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
+ scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+ uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
+ scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+ uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
+
+/*
+ * Declares the vector locals used by the BILINEAR_* macros below.  Expects
+ * 'wt', 'wb', 'unit_x' and 'vx' to be in scope at the point of expansion.
+ *
+ *   xmm_wt / xmm_wb - wt / wb replicated into all eight 16-bit lanes
+ *   xmm_xorc        - 0xff in the four low 16-bit lanes, 0 in the high four
+ *   xmm_addc        - 1 in the four low 16-bit lanes, 0 in the high four
+ *   xmm_ux          - unit_x (truncated to 16 bits) in every lane
+ *   xmm_x           - vx (truncated to 16 bits) in every lane; the only
+ *                     non-const variable, advanced per pixel by the macros
+ *
+ * The expansion deliberately has no trailing semicolon; the caller adds it.
+ */
+#define BILINEAR_DECLARE_VARIABLES \
+ const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \
+ const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \
+ const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);\
+ const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1); \
+ const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x, \
+ unit_x, unit_x, unit_x, unit_x); \
+ const __m128i xmm_zero = _mm_setzero_si128 (); \
+ __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx, vx, vx, vx, vx)
+
+/*
+ * Computes one bilinearly filtered pixel into 'pix' and advances both 'vx'
+ * and 'xmm_x' by one step.  Relies on the locals declared by
+ * BILINEAR_DECLARE_VARIABLES plus 'src_top', 'src_bottom', 'vx' and
+ * 'unit_x' from the enclosing function.
+ *
+ * Steps, as written below:
+ *  - load the 2x2 block; the top row sits in the high 64 bits and the
+ *    bottom row in the low 64 bits, left pixel in the lower lane of each;
+ *  - widen channels to 16 bits and form top * wt + bottom * wb, leaving the
+ *    left column in the low four lanes and the right column in the high four;
+ *  - build horizontal weights from bits 8..15 of xmm_x: the low four lanes
+ *    get (that value ^ 0xff) + 1, the high four lanes get the value itself;
+ *  - multiply to 32-bit products (mullo/mulhi pair), add the left and right
+ *    halves, shift right by 16 and pack back down to 8 bits per channel.
+ *
+ * NOTE(review): the final >> 16 presumably assumes wt + wb == 256 -- confirm
+ * against the callers that supply the weights.
+ */
+#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \
+do { \
+ __m128i xmm_wh, xmm_lo, xmm_hi, a; \
+ /* fetch 2x2 pixel block into sse2 register */ \
+ uint32_t tl = src_top [pixman_fixed_to_int (vx)]; \
+ uint32_t tr = src_top [pixman_fixed_to_int (vx) + 1]; \
+ uint32_t bl = src_bottom [pixman_fixed_to_int (vx)]; \
+ uint32_t br = src_bottom [pixman_fixed_to_int (vx) + 1]; \
+ a = _mm_set_epi32 (tr, tl, br, bl); \
+ vx += unit_x; \
+ /* vertical interpolation */ \
+ a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero), \
+ xmm_wt), \
+ _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero), \
+ xmm_wb)); \
+ /* calculate horizontal weights */ \
+ xmm_wh = _mm_add_epi16 (xmm_addc, \
+ _mm_xor_si128 (xmm_xorc, \
+ _mm_srli_epi16 (xmm_x, 8))); \
+ xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
+ /* horizontal interpolation */ \
+ xmm_lo = _mm_mullo_epi16 (a, xmm_wh); \
+ xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \
+ a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \
+ _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); \
+ /* shift and pack the result */ \
+ a = _mm_srli_epi32 (a, 16); \
+ a = _mm_packs_epi32 (a, a); \
+ a = _mm_packus_epi16 (a, a); \
+ pix = _mm_cvtsi128_si32 (a); \
+} while (0)
+
+/*
+ * Advances the scalar position 'vx' and the vector position 'xmm_x' by one
+ * step without fetching or computing a pixel, keeping both in step with
+ * BILINEAR_INTERPOLATE_ONE_PIXEL.
+ */
+#define BILINEAR_SKIP_ONE_PIXEL() \
+do { \
+ vx += unit_x; \
+ xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
+} while(0)
+
+/*
+ * Writes 'w' bilinearly filtered pixels to 'dst' (SRC operator: the
+ * destination is overwritten, never read).
+ *
+ * dst                - output scanline
+ * mask               - not referenced by this function
+ * src_top/src_bottom - the two source rows sampled by the BILINEAR_* macros
+ * w                  - number of pixels to produce
+ * wt, wb             - weights applied to the top and bottom row
+ * vx, unit_x         - fixed-point start position and per-pixel step
+ * max_vx, zero_src   - not referenced by this function
+ */
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst,
+ const uint32_t * mask,
+ const uint32_t * src_top,
+ const uint32_t * src_bottom,
+ int32_t w,
+ int wt,
+ int wb,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t max_vx,
+ pixman_bool_t zero_src)
+{
+ BILINEAR_DECLARE_VARIABLES;
+ uint32_t pix1, pix2, pix3, pix4;
+
+ /* four pixels per iteration; w is decremented before the test, so it is
+  * negative once the loop exits */
+ while ((w -= 4) >= 0)
+ {
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
+ *dst++ = pix1;
+ *dst++ = pix2;
+ *dst++ = pix3;
+ *dst++ = pix4;
+ }
+
+ /* only multiples of 4 were subtracted, so the two low bits of w still
+  * equal those of the number of pixels left (0..3) */
+ if (w & 2)
+ {
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+ *dst++ = pix1;
+ *dst++ = pix2;
+ }
+
+ if (w & 1)
+ {
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+ *dst = pix1;
+ }
+
+}
+
+/*
+ * Instantiate FAST_BILINEAR_MAINLOOP_COMMON around
+ * scaled_bilinear_scanline_sse2_8888_8888_SRC for the COVER, PAD, NONE and
+ * NORMAL variants, all with FLAG_NONE.  NOTE(review): the macro is defined
+ * outside this part of the file -- confirm what each variant generates.
+ */
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
+ scaled_bilinear_scanline_sse2_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
+ scaled_bilinear_scanline_sse2_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
+ scaled_bilinear_scanline_sse2_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ NONE, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
+ scaled_bilinear_scanline_sse2_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ NORMAL, FLAG_NONE)
+
+/*
+ * Composites 'w' bilinearly filtered source pixels OVER the destination.
+ *
+ * dst                - destination scanline, read and written in place
+ * mask               - not referenced by this function
+ * src_top/src_bottom - the two source rows sampled by the BILINEAR_* macros
+ * w                  - number of pixels
+ * wt, wb             - weights applied to the top and bottom row
+ * vx, unit_x         - fixed-point start position and per-pixel step
+ * max_vx, zero_src   - not referenced by this function
+ */
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst,
+ const uint32_t * mask,
+ const uint32_t * src_top,
+ const uint32_t * src_bottom,
+ int32_t w,
+ int wt,
+ int wb,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t max_vx,
+ pixman_bool_t zero_src)
+{
+ BILINEAR_DECLARE_VARIABLES;
+ uint32_t pix1, pix2, pix3, pix4;
+
+ /* head: single pixels until dst is 16-byte aligned */
+ while (w && ((unsigned long)dst & 15))
+ {
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+
+ /* an all-zero source pixel leaves the destination unchanged */
+ if (pix1)
+ {
+ pix2 = *dst;
+ *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
+ }
+
+ w--;
+ dst++;
+ }
+
+ /* main loop: four pixels per iteration with aligned load/store */
+ while (w >= 4)
+ {
+ __m128i xmm_src;
+ __m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
+ __m128i xmm_alpha_hi, xmm_alpha_lo;
+
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
+
+ /* pix1 goes to the lowest 32-bit lane, pix4 to the highest */
+ xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
+
+ if (!is_zero (xmm_src))
+ {
+ /* when is_opaque() holds the source replaces the destination */
+ if (is_opaque (xmm_src))
+ {
+ save_128_aligned ((__m128i *)dst, xmm_src);
+ }
+ else
+ {
+ __m128i xmm_dst = load_128_aligned ((__m128i *)dst);
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+ over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+ }
+
+ w -= 4;
+ dst += 4;
+ }
+
+ /* tail: fewer than four pixels remain */
+ while (w)
+ {
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+
+ if (pix1)
+ {
+ pix2 = *dst;
+ *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
+ }
+
+ w--;
+ dst++;
+ }
+}
+
+/*
+ * Instantiate FAST_BILINEAR_MAINLOOP_COMMON around
+ * scaled_bilinear_scanline_sse2_8888_8888_OVER for the COVER, PAD, NONE and
+ * NORMAL variants, all with FLAG_NONE.  NOTE(review): the macro is defined
+ * outside this part of the file -- confirm what each variant generates.
+ */
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
+ scaled_bilinear_scanline_sse2_8888_8888_OVER,
+ uint32_t, uint32_t, uint32_t,
+ COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
+ scaled_bilinear_scanline_sse2_8888_8888_OVER,
+ uint32_t, uint32_t, uint32_t,
+ PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
+ scaled_bilinear_scanline_sse2_8888_8888_OVER,
+ uint32_t, uint32_t, uint32_t,
+ NONE, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
+ scaled_bilinear_scanline_sse2_8888_8888_OVER,
+ uint32_t, uint32_t, uint32_t,
+ NORMAL, FLAG_NONE)
+
+/*
+ * Composites 'w' bilinearly filtered source pixels OVER the destination
+ * through an 8-bit-per-pixel mask.
+ *
+ * dst                - destination scanline, read and written in place
+ * mask               - one mask byte per destination pixel
+ * src_top/src_bottom - the two source rows sampled by the BILINEAR_* macros
+ * w                  - number of pixels
+ * wt, wb             - weights applied to the top and bottom row
+ * vx, unit_x         - fixed-point start position and per-pixel step
+ * max_vx, zero_src   - not referenced by this function
+ *
+ * Pixels whose mask byte is zero are not interpolated at all; the source
+ * position is merely advanced with BILINEAR_SKIP_ONE_PIXEL.
+ */
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
+                                                const uint8_t *  mask,
+                                                const uint32_t * src_top,
+                                                const uint32_t * src_bottom,
+                                                int32_t          w,
+                                                int              wt,
+                                                int              wb,
+                                                pixman_fixed_t   vx,
+                                                pixman_fixed_t   unit_x,
+                                                pixman_fixed_t   max_vx,
+                                                pixman_bool_t    zero_src)
+{
+    BILINEAR_DECLARE_VARIABLES;
+    uint32_t pix1, pix2, pix3, pix4;
+    uint32_t m;
+
+    /* head: single pixels until dst is 16-byte aligned */
+    while (w && ((unsigned long)dst & 15))
+    {
+        uint32_t sa;
+
+        m = (uint32_t) *mask++;
+
+        if (m)
+        {
+            BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+            sa = pix1 >> 24;
+
+            /* opaque source under a full mask simply replaces dst */
+            if (sa == 0xff && m == 0xff)
+            {
+                *dst = pix1;
+            }
+            else
+            {
+                __m128i ms, md, ma, msa;
+
+                pix2 = *dst;
+                ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+                ms = unpack_32_1x128 (pix1);
+                md = unpack_32_1x128 (pix2);
+
+                msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+                *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+            }
+        }
+        else
+        {
+            BILINEAR_SKIP_ONE_PIXEL ();
+        }
+
+        w--;
+        dst++;
+    }
+
+    /* main loop: four pixels per iteration with aligned load/store */
+    while (w >= 4)
+    {
+        __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
+        __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+        __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+        /*
+         * Fetch four mask bytes as one 32-bit value.  'mask' has no
+         * alignment guarantee and points at uint8_t objects, so it must
+         * not be dereferenced through a uint32_t pointer; assemble the
+         * bytes instead.  The little-endian order matches what a direct
+         * load yields on x86, the only target of this SSE2 code.
+         */
+        m = (uint32_t) mask[0] |
+            ((uint32_t) mask[1] << 8) |
+            ((uint32_t) mask[2] << 16) |
+            ((uint32_t) mask[3] << 24);
+
+        if (m)
+        {
+            BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+            BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+            BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
+            BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
+
+            /* pix1 goes to the lowest 32-bit lane, pix4 to the highest */
+            xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
+
+            if (m == 0xffffffff && is_opaque (xmm_src))
+            {
+                save_128_aligned ((__m128i *)dst, xmm_src);
+            }
+            else
+            {
+                xmm_dst = load_128_aligned ((__m128i *)dst);
+
+                /* widen the four mask bytes to one 32-bit lane each */
+                xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
+
+                unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+                expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
+                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+                in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
+                               &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+                save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+            }
+        }
+        else
+        {
+            /* all four mask bytes are zero: just step the source position */
+            BILINEAR_SKIP_ONE_PIXEL ();
+            BILINEAR_SKIP_ONE_PIXEL ();
+            BILINEAR_SKIP_ONE_PIXEL ();
+            BILINEAR_SKIP_ONE_PIXEL ();
+        }
+
+        w -= 4;
+        dst += 4;
+        mask += 4;
+    }
+
+    /* tail: fewer than four pixels remain */
+    while (w)
+    {
+        uint32_t sa;
+
+        m = (uint32_t) *mask++;
+
+        if (m)
+        {
+            BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+            sa = pix1 >> 24;
+
+            if (sa == 0xff && m == 0xff)
+            {
+                *dst = pix1;
+            }
+            else
+            {
+                __m128i ms, md, ma, msa;
+
+                pix2 = *dst;
+                ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+                ms = unpack_32_1x128 (pix1);
+                md = unpack_32_1x128 (pix2);
+
+                msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+                *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+            }
+        }
+        else
+        {
+            BILINEAR_SKIP_ONE_PIXEL ();
+        }
+
+        w--;
+        dst++;
+    }
+}
+
+/*
+ * Instantiate FAST_BILINEAR_MAINLOOP_COMMON around
+ * scaled_bilinear_scanline_sse2_8888_8_8888_OVER for the COVER, PAD, NONE
+ * and NORMAL variants, with a uint8_t mask type and
+ * FLAG_HAVE_NON_SOLID_MASK.  NOTE(review): the macro is defined outside
+ * this part of the file -- confirm what the flag changes.
+ */
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
+ scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
+ uint32_t, uint8_t, uint32_t,
+ COVER, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
+ scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
+ uint32_t, uint8_t, uint32_t,
+ PAD, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
+ scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
+ uint32_t, uint8_t, uint32_t,
+ NONE, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
+ scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
+ uint32_t, uint8_t, uint32_t,
+ NORMAL, FLAG_HAVE_NON_SOLID_MASK)
+
+/*
+ * Fast-path table handed to _pixman_implementation_create () by
+ * _pixman_implementation_create_sse2 ().  Each entry names an operator, a
+ * source / mask / destination format combination and the SSE2 routine that
+ * handles it.  The entry whose operator is PIXMAN_OP_NONE ends the table.
+ * NOTE(review): lookup presumably takes the first matching entry, which
+ * would make the ordering significant -- confirm before reordering.
+ */
+static const pixman_fast_path_t sse2_fast_paths[] =
+{
+ /* PIXMAN_OP_OVER */
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
+ PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
+ PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
+ PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
+ PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
+ PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
+ PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
+ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+
+ /* PIXMAN_OP_OVER_REVERSE */
+ PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
+
+ /* PIXMAN_OP_ADD */
+ PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
+ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
+ PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
+
+ /* PIXMAN_OP_SRC */
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
+
+ /* PIXMAN_OP_IN */
+ PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
+ PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
+ PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
+
+ /* nearest-neighbour scaled OVER (COVER / NONE / PAD variants) */
+ SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+
+ /* nearest-neighbour scaled OVER with a solid mask */
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
+
+ /* bilinear scaled SRC */
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
+
+ /* bilinear scaled OVER */
+ SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+
+ /* bilinear scaled OVER with an a8 mask */
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),
+
+ /* table terminator */
+ { PIXMAN_OP_NONE },
+};
+
+/*
+ * blt entry point of the SSE2 implementation.  Tries pixman_blt_sse2 ()
+ * first; if that reports failure the identical request is forwarded to the
+ * delegate implementation and its result is returned.
+ */
+static pixman_bool_t
+sse2_blt (pixman_implementation_t *imp,
+          uint32_t *               src_bits,
+          uint32_t *               dst_bits,
+          int                      src_stride,
+          int                      dst_stride,
+          int                      src_bpp,
+          int                      dst_bpp,
+          int                      src_x,
+          int                      src_y,
+          int                      dest_x,
+          int                      dest_y,
+          int                      width,
+          int                      height)
+{
+    /* the SSE2 routine handled the copy: nothing left to do */
+    if (pixman_blt_sse2 (src_bits, dst_bits,
+                         src_stride, dst_stride,
+                         src_bpp, dst_bpp,
+                         src_x, src_y,
+                         dest_x, dest_y,
+                         width, height))
+    {
+        return TRUE;
+    }
+
+    /* otherwise let the next implementation in the chain try */
+    return _pixman_implementation_blt (imp->delegate,
+                                       src_bits, dst_bits,
+                                       src_stride, dst_stride,
+                                       src_bpp, dst_bpp,
+                                       src_x, src_y,
+                                       dest_x, dest_y,
+                                       width, height);
+}
+
+/*
+ * fill entry point of the SSE2 implementation.  Tries pixman_fill_sse2 ()
+ * first; if that reports failure the identical request is forwarded to the
+ * delegate implementation and its result is returned.
+ *
+ * On 32-bit GCC builds the attribute below makes the function realign the
+ * stack on entry.
+ */
+#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+__attribute__((__force_align_arg_pointer__))
+#endif
+static pixman_bool_t
+sse2_fill (pixman_implementation_t *imp,
+           uint32_t *               bits,
+           int                      stride,
+           int                      bpp,
+           int                      x,
+           int                      y,
+           int                      width,
+           int                      height,
+           uint32_t                 xor)
+{
+    /* the SSE2 routine handled the fill: nothing left to do */
+    if (pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
+        return TRUE;
+
+    /* otherwise let the next implementation in the chain try */
+    return _pixman_implementation_fill (
+        imp->delegate, bits, stride, bpp, x, y, width, height, xor);
+}
+
+/*
+ * Scanline fetcher for x8r8g8b8: copies iter->width pixels from the
+ * current row (iter->bits) into iter->buffer with the top byte of every
+ * pixel forced to 0xff, then advances iter->bits by iter->stride.
+ * 'mask' is not used.  Returns iter->buffer.
+ */
+static uint32_t *
+sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    const __m128i alpha_bits = mask_ff000000;
+    uint32_t *in = (uint32_t *)iter->bits;
+    uint32_t *out = iter->buffer;
+    int remaining = iter->width;
+
+    /* step to the next row for the following call */
+    iter->bits += iter->stride;
+
+    /* head: one pixel at a time until the output is 16-byte aligned */
+    for (; remaining && (((unsigned long)out) & 0x0f); remaining--)
+        *out++ = (*in++) | 0xff000000;
+
+    /* body: four pixels per step, unaligned load / aligned store */
+    for (; remaining >= 4; remaining -= 4, in += 4, out += 4)
+    {
+        __m128i four = load_128_unaligned ((__m128i *)in);
+
+        save_128_aligned ((__m128i *)out, _mm_or_si128 (four, alpha_bits));
+    }
+
+    /* tail: whatever is left */
+    for (; remaining; remaining--)
+        *out++ = (*in++) | 0xff000000;
+
+    return iter->buffer;
+}
+
+/*
+ * Scanline fetcher for r5g6b5: expands iter->width 16-bit pixels from the
+ * current row (iter->bits) into 32-bit pixels in iter->buffer, then
+ * advances iter->bits by iter->stride.  'mask' is not used.  Returns
+ * iter->buffer.
+ */
+static uint32_t *
+sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    const __m128i alpha_bits = mask_ff000000;
+    uint16_t *in = (uint16_t *)iter->bits;
+    uint32_t *out = iter->buffer;
+    int remaining = iter->width;
+
+    /* step to the next row for the following call */
+    iter->bits += iter->stride;
+
+    /* head: one pixel at a time until the output is 16-byte aligned */
+    for (; remaining && (((unsigned long)out) & 0x0f); remaining--)
+    {
+        uint16_t p = *in++;
+
+        *out++ = CONVERT_0565_TO_8888 (p);
+    }
+
+    /* body: eight pixels per step; one unaligned load yields two
+     * aligned four-pixel stores with the top byte OR-ed to 0xff */
+    for (; remaining >= 8; remaining -= 8, in += 8, out += 8)
+    {
+        __m128i packed = _mm_loadu_si128 ((__m128i *)in);
+        __m128i first4 = unpack_565_to_8888 (
+            _mm_unpacklo_epi16 (packed, _mm_setzero_si128 ()));
+        __m128i last4 = unpack_565_to_8888 (
+            _mm_unpackhi_epi16 (packed, _mm_setzero_si128 ()));
+
+        save_128_aligned ((__m128i *)(out + 0),
+                          _mm_or_si128 (first4, alpha_bits));
+        save_128_aligned ((__m128i *)(out + 4),
+                          _mm_or_si128 (last4, alpha_bits));
+    }
+
+    /* tail: whatever is left */
+    for (; remaining; remaining--)
+    {
+        uint16_t p = *in++;
+
+        *out++ = CONVERT_0565_TO_8888 (p);
+    }
+
+    return iter->buffer;
+}
+
+/*
+ * Scanline fetcher for a8: expands iter->width bytes from the current row
+ * (iter->bits) into 32-bit pixels in iter->buffer, each byte ending up in
+ * the top 8 bits of its pixel with the lower 24 bits zero.  Advances
+ * iter->bits by iter->stride afterwards.  'mask' is not used.  Returns
+ * iter->buffer.
+ */
+static uint32_t *
+sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int w = iter->width;
+    uint32_t *dst = iter->buffer;
+    uint8_t *src = iter->bits;
+    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
+
+    /* step to the next row for the following call */
+    iter->bits += iter->stride;
+
+    /* head: one pixel at a time until dst is 16-byte aligned */
+    while (w && (((unsigned long)dst) & 15))
+    {
+        /* widen to uint32_t before shifting: the byte would otherwise be
+         * promoted to signed int, and shifting a value >= 0x80 left by 24
+         * overflows int, which is undefined behaviour */
+        *dst++ = (uint32_t) *(src++) << 24;
+        w--;
+    }
+
+    /* body: sixteen pixels per step */
+    while (w >= 16)
+    {
+        xmm0 = _mm_loadu_si128((__m128i *)src);
+
+        /* interleaving with zero as the low part moves each byte into the
+         * high byte of a 16-bit lane, then into the high byte of a 32-bit
+         * lane */
+        xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128(), xmm0);
+        xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128(), xmm0);
+        xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
+        xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
+        xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
+        xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
+
+        _mm_store_si128(((__m128i *)(dst +  0)), xmm3);
+        _mm_store_si128(((__m128i *)(dst +  4)), xmm4);
+        _mm_store_si128(((__m128i *)(dst +  8)), xmm5);
+        _mm_store_si128(((__m128i *)(dst + 12)), xmm6);
+
+        dst += 16;
+        src += 16;
+        w -= 16;
+    }
+
+    /* tail: whatever is left */
+    while (w)
+    {
+        *dst++ = (uint32_t) *(src++) << 24;
+        w--;
+    }
+
+    return iter->buffer;
+}
+
+/* Associates a pixel format with the scanline fetcher that handles it. */
+typedef struct
+{
+ pixman_format_code_t format; /* format code compared against the image's extended_format_code */
+ pixman_iter_get_scanline_t get_scanline; /* installed as iter->get_scanline on a match */
+} fetcher_info_t;
+
+/*
+ * Formats for which an SSE2 scanline fetcher exists.  sse2_src_iter_init ()
+ * walks this table until it reaches the PIXMAN_null entry that ends it.
+ */
+static const fetcher_info_t fetchers[] =
+{
+ { PIXMAN_x8r8g8b8, sse2_fetch_x8r8g8b8 },
+ { PIXMAN_r5g6b5, sse2_fetch_r5g6b5 },
+ { PIXMAN_a8, sse2_fetch_a8 },
+ { PIXMAN_null }
+};
+
+/*
+ * Source-iterator setup for the SSE2 implementation.
+ *
+ * When the iterator has ITER_NARROW set, the image carries every flag in
+ * FLAGS, the requested rectangle lies inside the image, and the image's
+ * extended format code appears in 'fetchers', the iterator is pointed at
+ * the first requested pixel and given the matching SSE2 get_scanline
+ * routine.  In every other case the request is passed to the delegate.
+ */
+static void
+sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+{
+    pixman_image_t *image = iter->image;
+    const fetcher_info_t *entry;
+    int left = iter->x;
+    int top = iter->y;
+    int cols = iter->width;
+    int rows = iter->height;
+    int eligible;
+
+#define FLAGS \
+    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | FAST_PATH_BITS_IMAGE)
+
+    eligible =
+        (iter->flags & ITER_NARROW) &&
+        (image->common.flags & FLAGS) == FLAGS &&
+        left >= 0 && top >= 0 &&
+        left + cols <= image->bits.width &&
+        top + rows <= image->bits.height;
+
+    /* the table is scanned only for eligible requests */
+    for (entry = &fetchers[0];
+         eligible && entry->format != PIXMAN_null;
+         entry++)
+    {
+        if (image->common.extended_format_code == entry->format)
+        {
+            uint8_t *base = (uint8_t *)image->bits.bits;
+            /* rowstride is multiplied by 4 to obtain a byte stride */
+            int row_bytes = image->bits.rowstride * 4;
+
+            iter->bits = base + row_bytes * iter->y +
+                left * PIXMAN_FORMAT_BPP (entry->format) / 8;
+            iter->stride = row_bytes;
+
+            iter->get_scanline = entry->get_scanline;
+            return;
+        }
+    }
+
+    /* no SSE2 fetcher applies: let the delegate set the iterator up */
+    imp->delegate->src_iter_init (imp->delegate, iter);
+}
+
+/*
+ * Creates the SSE2 implementation on top of 'fallback'.
+ *
+ * Builds the implementation from the sse2_fast_paths table, initialises
+ * the file-scope __m128i mask constants, and installs the SSE2 combiners,
+ * blt, fill and source-iterator hooks.  Returns the new implementation.
+ *
+ * On 32-bit GCC builds the attribute below makes the function realign the
+ * stack on entry.
+ */
+#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+__attribute__((__force_align_arg_pointer__))
+#endif
+pixman_implementation_t *
+_pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
+{
+ pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
+
+ /* SSE2 constants */
+ mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
+ mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
+ mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
+ mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
+ mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
+ mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
+ mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
+ mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
+ mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
+ mask_0080 = create_mask_16_128 (0x0080);
+ mask_00ff = create_mask_16_128 (0x00ff);
+ mask_0101 = create_mask_16_128 (0x0101);
+ mask_ffff = create_mask_16_128 (0xffff);
+ mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
+ mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
+
+ /* Set up function pointers */
+ /* unified-alpha combiners */
+ imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
+ imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
+ imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
+ imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
+ imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
+ imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
+ imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
+ imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
+ imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
+ imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
+
+ imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
+
+ /* component-alpha combiners */
+ imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
+ imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
+ imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
+ imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
+ imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
+ imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
+ imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
+ imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
+ imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
+ imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
+ imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
+
+ /* block copy and fill hooks */
+ imp->blt = sse2_blt;
+ imp->fill = sse2_fill;
+
+ /* source iterator hook */
+ imp->src_iter_init = sse2_src_iter_init;
+
+ return imp;
+}