*
* Author: Rodrigo Kumpera (kumpera@gmail.com)
* André Tupinambá (andrelrt@gmail.com)
- *
+ *
* Based on work by Owen Taylor and Søren Sandmann
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
-#include <mmintrin.h>
#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"
#include "pixman-combine32.h"
+#include "pixman-inlines.h"
-#ifdef USE_SSE2
+static __m128i mask_0080;
+static __m128i mask_00ff;
+static __m128i mask_0101;
+static __m128i mask_ffff;
+static __m128i mask_ff000000;
+static __m128i mask_alpha;
-/* -------------------------------------------------------------------------------------------------
- * Locals
- */
+static __m128i mask_565_r;
+static __m128i mask_565_g1, mask_565_g2;
+static __m128i mask_565_b;
+static __m128i mask_red;
+static __m128i mask_green;
+static __m128i mask_blue;
+
+static __m128i mask_565_fix_rb;
+static __m128i mask_565_fix_g;
-static __m64 xMask0080;
-static __m64 xMask00ff;
-static __m64 xMask0101;
-static __m64 xMaskAlpha;
-
-static __m64 xMask565rgb;
-static __m64 xMask565Unpack;
-
-static __m128i Mask0080;
-static __m128i Mask00ff;
-static __m128i Mask0101;
-static __m128i Maskffff;
-static __m128i Maskff000000;
-static __m128i MaskAlpha;
-
-static __m128i Mask565r;
-static __m128i Mask565g1, Mask565g2;
-static __m128i Mask565b;
-static __m128i MaskRed;
-static __m128i MaskGreen;
-static __m128i MaskBlue;
-
-static __m128i Mask565FixRB;
-static __m128i Mask565FixG;
-
-/* -------------------------------------------------------------------------------------------------
- * SSE2 Inlines
- */
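/* The helpers below mostly work on "unpacked" pixels: each 8-bit channel
 * zero-extended to 16 bits, so one __m128i holds two ARGB pixels and the
 * per-channel multiplies have headroom.  The pack_* helpers convert back
 * to packed 8-bit channels with unsigned saturation.
 */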
static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
- return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128());
+ return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}
static force_inline void
-unpack_128_2x128 (__m128i data, __m128i* dataLo, __m128i* dataHi)
+unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
{
- *dataLo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
- *dataHi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
+ *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
+ *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}
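/* Convert four r5g6b5 pixels, one per 32-bit lane, to x8r8g8b8: each field
 * is shifted into its byte and its top bits are replicated into the freed
 * low bits (r and b by >> 5, g by >> 6), so 0x1f and 0x3f expand to 0xff.
 * This assumes mask_red/mask_green/mask_blue and the two fix-up masks
 * select the shifted 5/6/5 fields; their initialization is outside this
 * hunk.
 */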
static force_inline __m128i
-unpack565to8888 (__m128i lo)
+unpack_565_to_8888 (__m128i lo)
{
__m128i r, g, b, rb, t;
-
- r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), MaskRed);
- g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), MaskGreen);
- b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), MaskBlue);
+
+ r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
+ g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
+ b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
rb = _mm_or_si128 (r, b);
- t = _mm_and_si128 (rb, Mask565FixRB);
+ t = _mm_and_si128 (rb, mask_565_fix_rb);
t = _mm_srli_epi32 (t, 5);
rb = _mm_or_si128 (rb, t);
- t = _mm_and_si128 (g, Mask565FixG);
+ t = _mm_and_si128 (g, mask_565_fix_g);
t = _mm_srli_epi32 (t, 6);
g = _mm_or_si128 (g, t);
-
+
return _mm_or_si128 (rb, g);
}
static force_inline void
-unpack565_128_4x128 (__m128i data, __m128i* data0, __m128i* data1, __m128i* data2, __m128i* data3)
+unpack_565_128_4x128 (__m128i data,
+ __m128i* data0,
+ __m128i* data1,
+ __m128i* data2,
+ __m128i* data3)
{
__m128i lo, hi;
lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
- lo = unpack565to8888 (lo);
- hi = unpack565to8888 (hi);
+ lo = unpack_565_to_8888 (lo);
+ hi = unpack_565_to_8888 (hi);
unpack_128_2x128 (lo, data0, data1);
unpack_128_2x128 (hi, data2, data3);
}
static force_inline uint16_t
-pack565_32_16 (uint32_t pixel)
+pack_565_32_16 (uint32_t pixel)
{
- return (uint16_t) (((pixel>>8) & 0xf800) | ((pixel>>5) & 0x07e0) | ((pixel>>3) & 0x001f));
+ return (uint16_t) (((pixel >> 8) & 0xf800) |
+ ((pixel >> 5) & 0x07e0) |
+ ((pixel >> 3) & 0x001f));
}
static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}
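/* Reduce unpacked pixels to r5g6b5: repack to 8888, then mask and shift the
 * r, g and b fields into place (green is assembled from two pieces).  The
 * fields land in the two low bytes of each 32-bit lane so that the final
 * _mm_packus_epi16 in pack_565_4x128_128 yields packed 16-bit 565 pixels.
 * The mask_565_* constants are initialized outside this hunk.
 */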
static force_inline __m128i
-pack565_2x128_128 (__m128i lo, __m128i hi)
+pack_565_2x128_128 (__m128i lo, __m128i hi)
{
__m128i data;
__m128i r, g1, g2, b;
- data = pack_2x128_128 ( lo, hi );
+ data = pack_2x128_128 (lo, hi);
- r = _mm_and_si128 (data , Mask565r);
- g1 = _mm_and_si128 (_mm_slli_epi32 (data , 3), Mask565g1);
- g2 = _mm_and_si128 (_mm_srli_epi32 (data , 5), Mask565g2);
- b = _mm_and_si128 (_mm_srli_epi32 (data , 3), Mask565b);
+ r = _mm_and_si128 (data, mask_565_r);
+ g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
+ g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
+ b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}
static force_inline __m128i
-pack565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
+pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
- return _mm_packus_epi16 (pack565_2x128_128 (*xmm0, *xmm1), pack565_2x128_128 (*xmm2, *xmm3));
+ return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
+ pack_565_2x128_128 (*xmm2, *xmm3));
}
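/* Predicates over 4 packed a8r8g8b8 pixels.  _mm_movemask_epi8 gathers one
 * bit per byte; 0xffff means every byte matched, while 0x8888 keeps only
 * byte 3 of each 32-bit lane, i.e. the alpha byte of each pixel.
 */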
static force_inline int
-isOpaque (__m128i x)
+is_opaque (__m128i x)
{
__m128i ffs = _mm_cmpeq_epi8 (x, x);
+
return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
}
static force_inline int
-isZero (__m128i x)
+is_zero (__m128i x)
{
- return _mm_movemask_epi8 (_mm_cmpeq_epi8 (x, _mm_setzero_si128())) == 0xffff;
+ return _mm_movemask_epi8 (
+ _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
}
static force_inline int
-isTransparent (__m128i x)
+is_transparent (__m128i x)
{
- return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, _mm_setzero_si128())) & 0x8888) == 0x8888;
+ return (_mm_movemask_epi8 (
+ _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
}
static force_inline __m128i
-expandPixel_32_1x128 (uint32_t data)
+expand_pixel_32_1x128 (uint32_t data)
{
- return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE(1, 0, 1, 0));
+ return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
}
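/* Broadcast the alpha word (word 3 of each unpacked pixel) into all four
 * channel words; the _rev variants broadcast word 0 instead.
 */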
static force_inline __m128i
-expandAlpha_1x128 (__m128i data)
+expand_alpha_1x128 (__m128i data)
{
- return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
+ return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
+ _MM_SHUFFLE (3, 3, 3, 3)),
+ _MM_SHUFFLE (3, 3, 3, 3));
}
static force_inline void
-expandAlpha_2x128 (__m128i dataLo, __m128i dataHi, __m128i* alphaLo, __m128i* alphaHi)
+expand_alpha_2x128 (__m128i data_lo,
+ __m128i data_hi,
+ __m128i* alpha_lo,
+ __m128i* alpha_hi)
{
__m128i lo, hi;
- lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(3, 3, 3, 3));
- hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(3, 3, 3, 3));
- *alphaLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(3, 3, 3, 3));
- *alphaHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(3, 3, 3, 3));
+ lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
+ hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
+
+ *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
+ *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
}
static force_inline void
-expandAlphaRev_2x128 (__m128i dataLo, __m128i dataHi, __m128i* alphaLo, __m128i* alphaHi)
+expand_alpha_rev_2x128 (__m128i data_lo,
+ __m128i data_hi,
+ __m128i* alpha_lo,
+ __m128i* alpha_hi)
{
__m128i lo, hi;
- lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(0, 0, 0, 0));
- hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(0, 0, 0, 0));
- *alphaLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(0, 0, 0, 0));
- *alphaHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(0, 0, 0, 0));
+ lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
+ hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
+ *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
+ *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
}
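/* Per-channel multiply with the usual pixman rounded division by 255,
 * assuming mask_0080 and mask_0101 hold 0x0080 and 0x0101 in every 16-bit
 * word (their initialization is outside this hunk).  Scalar equivalent for
 * one channel (a, b in 0..255):
 *
 *     t = a * b + 0x80;
 *     r = (t * 0x101) >> 16;    which equals (t + (t >> 8)) >> 8
 */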
static force_inline void
-pixMultiply_2x128 (__m128i* dataLo, __m128i* dataHi, __m128i* alphaLo, __m128i* alphaHi, __m128i* retLo, __m128i* retHi)
+pix_multiply_2x128 (__m128i* data_lo,
+ __m128i* data_hi,
+ __m128i* alpha_lo,
+ __m128i* alpha_hi,
+ __m128i* ret_lo,
+ __m128i* ret_hi)
{
__m128i lo, hi;
- lo = _mm_mullo_epi16 (*dataLo, *alphaLo);
- hi = _mm_mullo_epi16 (*dataHi, *alphaHi);
- lo = _mm_adds_epu16 (lo, Mask0080);
- hi = _mm_adds_epu16 (hi, Mask0080);
- *retLo = _mm_mulhi_epu16 (lo, Mask0101);
- *retHi = _mm_mulhi_epu16 (hi, Mask0101);
+ lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
+ hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
+ lo = _mm_adds_epu16 (lo, mask_0080);
+ hi = _mm_adds_epu16 (hi, mask_0080);
+ *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
+ *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
}
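/* src * alpha_dst + dst * alpha_src, each product going through the same
 * divide-by-255 step as pix_multiply_2x128, added with unsigned saturation.
 * This is the workhorse of the ATOP and XOR combiners below.
 */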
static force_inline void
-pixAddMultiply_2x128 (__m128i* srcLo, __m128i* srcHi, __m128i* alphaDstLo, __m128i* alphaDstHi,
- __m128i* dstLo, __m128i* dstHi, __m128i* alphaSrcLo, __m128i* alphaSrcHi,
- __m128i* retLo, __m128i* retHi)
+pix_add_multiply_2x128 (__m128i* src_lo,
+ __m128i* src_hi,
+ __m128i* alpha_dst_lo,
+ __m128i* alpha_dst_hi,
+ __m128i* dst_lo,
+ __m128i* dst_hi,
+ __m128i* alpha_src_lo,
+ __m128i* alpha_src_hi,
+ __m128i* ret_lo,
+ __m128i* ret_hi)
{
- __m128i lo, hi;
- __m128i mulLo, mulHi;
-
- lo = _mm_mullo_epi16 (*srcLo, *alphaDstLo);
- hi = _mm_mullo_epi16 (*srcHi, *alphaDstHi);
- mulLo = _mm_mullo_epi16 (*dstLo, *alphaSrcLo);
- mulHi = _mm_mullo_epi16 (*dstHi, *alphaSrcHi);
- lo = _mm_adds_epu16 (lo, Mask0080);
- hi = _mm_adds_epu16 (hi, Mask0080);
- lo = _mm_adds_epu16 (lo, mulLo);
- hi = _mm_adds_epu16 (hi, mulHi);
- *retLo = _mm_mulhi_epu16 (lo, Mask0101);
- *retHi = _mm_mulhi_epu16 (hi, Mask0101);
+ __m128i t1_lo, t1_hi;
+ __m128i t2_lo, t2_hi;
+
+ pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
+ pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
+
+ *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
+ *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
}
static force_inline void
-negate_2x128 (__m128i dataLo, __m128i dataHi, __m128i* negLo, __m128i* negHi)
+negate_2x128 (__m128i data_lo,
+ __m128i data_hi,
+ __m128i* neg_lo,
+ __m128i* neg_hi)
{
- *negLo = _mm_xor_si128 (dataLo, Mask00ff);
- *negHi = _mm_xor_si128 (dataHi, Mask00ff);
+ *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
+ *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
}
static force_inline void
-invertColors_2x128 (__m128i dataLo, __m128i dataHi, __m128i* invLo, __m128i* invHi)
+invert_colors_2x128 (__m128i data_lo,
+ __m128i data_hi,
+ __m128i* inv_lo,
+ __m128i* inv_hi)
{
__m128i lo, hi;
- lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(3, 0, 1, 2));
- hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(3, 0, 1, 2));
- *invLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(3, 0, 1, 2));
- *invHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(3, 0, 1, 2));
+ lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
+ hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
+ *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
+ *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
}
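/* Porter-Duff OVER on unpacked data: dst = src + dst * (255 - alpha) / 255,
 * where alpha is normally the already-expanded alpha of src.
 */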
static force_inline void
-over_2x128 (__m128i* srcLo, __m128i* srcHi, __m128i* alphaLo, __m128i* alphaHi, __m128i* dstLo, __m128i* dstHi)
+over_2x128 (__m128i* src_lo,
+ __m128i* src_hi,
+ __m128i* alpha_lo,
+ __m128i* alpha_hi,
+ __m128i* dst_lo,
+ __m128i* dst_hi)
{
__m128i t1, t2;
- negate_2x128 (*alphaLo, *alphaHi, &t1, &t2);
+ negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
- pixMultiply_2x128 (dstLo, dstHi, &t1, &t2, dstLo, dstHi);
+ pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
- *dstLo = _mm_adds_epu8 (*srcLo, *dstLo);
- *dstHi = _mm_adds_epu8 (*srcHi, *dstHi);
+ *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
+ *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
}
static force_inline void
-overRevNonPre_2x128 (__m128i srcLo, __m128i srcHi, __m128i* dstLo, __m128i* dstHi)
+over_rev_non_pre_2x128 (__m128i src_lo,
+ __m128i src_hi,
+ __m128i* dst_lo,
+ __m128i* dst_hi)
{
__m128i lo, hi;
- __m128i alphaLo, alphaHi;
+ __m128i alpha_lo, alpha_hi;
- expandAlpha_2x128 (srcLo, srcHi, &alphaLo, &alphaHi);
+ expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
- lo = _mm_or_si128 (alphaLo, MaskAlpha);
- hi = _mm_or_si128 (alphaHi, MaskAlpha);
+ lo = _mm_or_si128 (alpha_lo, mask_alpha);
+ hi = _mm_or_si128 (alpha_hi, mask_alpha);
- invertColors_2x128 (srcLo, srcHi, &srcLo, &srcHi);
+ invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
- pixMultiply_2x128 (&srcLo, &srcHi, &lo, &hi, &lo, &hi);
+ pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
- over_2x128 (&lo, &hi, &alphaLo, &alphaHi, dstLo, dstHi);
+ over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
}
static force_inline void
-inOver_2x128 (__m128i* srcLo, __m128i* srcHi, __m128i* alphaLo, __m128i* alphaHi,
- __m128i* maskLo, __m128i* maskHi, __m128i* dstLo, __m128i* dstHi)
+in_over_2x128 (__m128i* src_lo,
+ __m128i* src_hi,
+ __m128i* alpha_lo,
+ __m128i* alpha_hi,
+ __m128i* mask_lo,
+ __m128i* mask_hi,
+ __m128i* dst_lo,
+ __m128i* dst_hi)
{
- __m128i sLo, sHi;
- __m128i aLo, aHi;
-
- pixMultiply_2x128 ( srcLo, srcHi, maskLo, maskHi, &sLo, &sHi);
- pixMultiply_2x128 (alphaLo, alphaHi, maskLo, maskHi, &aLo, &aHi);
-
- over_2x128 (&sLo, &sHi, &aLo, &aHi, dstLo, dstHi);
-}
+ __m128i s_lo, s_hi;
+ __m128i a_lo, a_hi;
-static force_inline void
-cachePrefetch (__m128i* addr)
-{
- _mm_prefetch (addr, _MM_HINT_T0);
-}
+ pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
+ pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
-static force_inline void
-cachePrefetchNext (__m128i* addr)
-{
- _mm_prefetch (addr + 4, _MM_HINT_T0); // 64 bytes ahead
+ over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}
/* load 4 pixels from a 16-byte boundary aligned address */
static force_inline __m128i
-load128Aligned (__m128i* src)
+load_128_aligned (__m128i* src)
{
return _mm_load_si128 (src);
}
/* load 4 pixels from an unaligned address */
static force_inline __m128i
-load128Unaligned (const __m128i* src)
+load_128_unaligned (const __m128i* src)
{
return _mm_loadu_si128 (src);
}
-/* save 4 pixels using Write Combining memory on a 16-byte boundary aligned address */
+/* save 4 pixels using Write Combining memory on a 16-byte
+ * boundary aligned address
+ */
static force_inline void
-save128WriteCombining (__m128i* dst, __m128i data)
+save_128_write_combining (__m128i* dst,
+ __m128i data)
{
_mm_stream_si128 (dst, data);
}
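/* _mm_stream_si128 is a non-temporal store: it goes through write-combining
 * buffers and bypasses the cache, which only pays off when the destination
 * is not going to be read back soon.
 */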
/* save 4 pixels on a 16-byte boundary aligned address */
static force_inline void
-save128Aligned (__m128i* dst, __m128i data)
+save_128_aligned (__m128i* dst,
+ __m128i data)
{
_mm_store_si128 (dst, data);
}
/* save 4 pixels on an unaligned address */
static force_inline void
-save128Unaligned (__m128i* dst, __m128i data)
+save_128_unaligned (__m128i* dst,
+ __m128i data)
{
_mm_storeu_si128 (dst, data);
}
-/* -------------------------------------------------------------------------------------------------
- * MMX inlines
- */
-
-static force_inline __m64
-unpack_32_1x64 (uint32_t data)
+static force_inline __m128i
+load_32_1x128 (uint32_t data)
{
- return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64());
+ return _mm_cvtsi32_si128 (data);
}
-static force_inline __m64
-expandAlpha_1x64 (__m64 data)
+static force_inline __m128i
+expand_alpha_rev_1x128 (__m128i data)
{
- return _mm_shuffle_pi16 (data, _MM_SHUFFLE(3, 3, 3, 3));
+ return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}
-static force_inline __m64
-expandAlphaRev_1x64 (__m64 data)
+static force_inline __m128i
+expand_pixel_8_1x128 (uint8_t data)
{
- return _mm_shuffle_pi16 (data, _MM_SHUFFLE(0, 0, 0, 0));
+ return _mm_shufflelo_epi16 (
+ unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}
-static force_inline __m64
-expandPixel_8_1x64 (uint8_t data)
+static force_inline __m128i
+pix_multiply_1x128 (__m128i data,
+ __m128i alpha)
{
- return _mm_shuffle_pi16 (unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE(0, 0, 0, 0));
+ return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
+ mask_0080),
+ mask_0101);
}
-static force_inline __m64
-pixMultiply_1x64 (__m64 data, __m64 alpha)
+static force_inline __m128i
+pix_add_multiply_1x128 (__m128i* src,
+ __m128i* alpha_dst,
+ __m128i* dst,
+ __m128i* alpha_src)
{
- return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
- xMask0080),
- xMask0101);
-}
+ __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
+ __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
-static force_inline __m64
-pixAddMultiply_1x64 (__m64* src, __m64* alphaDst, __m64* dst, __m64* alphaSrc)
-{
- return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (*src, *alphaDst),
- xMask0080),
- _mm_mullo_pi16 (*dst, *alphaSrc)),
- xMask0101);
+ return _mm_adds_epu8 (t1, t2);
}
-static force_inline __m64
-negate_1x64 (__m64 data)
+static force_inline __m128i
+negate_1x128 (__m128i data)
{
- return _mm_xor_si64 (data, xMask00ff);
+ return _mm_xor_si128 (data, mask_00ff);
}
-static force_inline __m64
-invertColors_1x64 (__m64 data)
+static force_inline __m128i
+invert_colors_1x128 (__m128i data)
{
- return _mm_shuffle_pi16 (data, _MM_SHUFFLE(3, 0, 1, 2));
+ return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}
-static force_inline __m64
-over_1x64 (__m64 src, __m64 alpha, __m64 dst)
+static force_inline __m128i
+over_1x128 (__m128i src, __m128i alpha, __m128i dst)
{
- return _mm_adds_pu8 (src, pixMultiply_1x64 (dst, negate_1x64 (alpha)));
+ return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
}
-static force_inline __m64
-inOver_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
+static force_inline __m128i
+in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
{
- return over_1x64 (pixMultiply_1x64 (*src, *mask),
- pixMultiply_1x64 (*alpha, *mask),
- *dst);
+ return over_1x128 (pix_multiply_1x128 (*src, *mask),
+ pix_multiply_1x128 (*alpha, *mask),
+ *dst);
}
-static force_inline __m64
-overRevNonPre_1x64 (__m64 src, __m64 dst)
+static force_inline __m128i
+over_rev_non_pre_1x128 (__m128i src, __m128i dst)
{
- __m64 alpha = expandAlpha_1x64 (src);
+ __m128i alpha = expand_alpha_1x128 (src);
- return over_1x64 (pixMultiply_1x64 (invertColors_1x64 (src),
- _mm_or_si64 (alpha, xMaskAlpha)),
- alpha,
- dst);
+ return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
+ _mm_or_si128 (alpha, mask_alpha)),
+ alpha,
+ dst);
}
static force_inline uint32_t
-pack_1x64_32( __m64 data )
+pack_1x128_32 (__m128i data)
{
- return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64()));
+ return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
}
-/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
- *
- * 00RR00GG00BB
- *
- * --- Expanding 565 in the low word ---
- *
- * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
- * m = m & (01f0003f001f);
- * m = m * (008404100840);
- * m = m >> 8;
- *
- * Note the trick here - the top word is shifted by another nibble to
- * avoid it bumping into the middle word
- */
-static force_inline __m64
-expand565_16_1x64 (uint16_t pixel)
+static force_inline __m128i
+expand565_16_1x128 (uint16_t pixel)
{
- __m64 p;
- __m64 t1, t2;
-
- p = _mm_cvtsi32_si64 ((uint32_t) pixel);
-
- t1 = _mm_slli_si64 (p, 36 - 11);
- t2 = _mm_slli_si64 (p, 16 - 5);
+ __m128i m = _mm_cvtsi32_si128 (pixel);
- p = _mm_or_si64 (t1, p);
- p = _mm_or_si64 (t2, p);
- p = _mm_and_si64 (p, xMask565rgb);
- p = _mm_mullo_pi16 (p, xMask565Unpack);
+ m = unpack_565_to_8888 (m);
- return _mm_srli_pi16 (p, 8);
+ return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
}
-/* -------------------------------------------------------------------------------------------------
- * Compose Core transformations
- */
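/* OVER for a single pixel, with shortcuts for a fully opaque source (plain
 * copy) and an all-zero source (destination kept as is).
 */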
static force_inline uint32_t
-coreCombineOverUPixelsse2 (uint32_t src, uint32_t dst)
+core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
- uint8_t a;
- __m64 ms;
+ uint8_t a;
+ __m128i xmms;
a = src >> 24;
if (a == 0xff)
{
- return src;
+ return src;
}
else if (src)
{
- ms = unpack_32_1x64 (src);
- return pack_1x64_32 (over_1x64 (ms, expandAlpha_1x64 (ms), unpack_32_1x64 (dst)));
+ xmms = unpack_32_1x128 (src);
+ return pack_1x128_32 (
+ over_1x128 (xmms, expand_alpha_1x128 (xmms),
+ unpack_32_1x128 (dst)));
}
    return dst;
}
static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s = *ps;
    if (pm)
{
- __m64 ms, mm;
+ __m128i ms, mm;
+
+ mm = unpack_32_1x128 (*pm);
+ mm = expand_alpha_1x128 (mm);
- mm = unpack_32_1x64 (*pm);
- mm = expandAlpha_1x64 (mm);
-
- ms = unpack_32_1x64 (s);
- ms = pixMultiply_1x64 (ms, mm);
+ ms = unpack_32_1x128 (s);
+ ms = pix_multiply_1x128 (ms, mm);
- s = pack_1x64_32 (ms);
+ s = pack_1x128_32 (ms);
}
    return s;
}
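/* combine1/combine4 fetch one or four source pixels and, when a mask is
 * present, multiply them by the mask's expanded alpha; combine4 also bails
 * out early with zero when all four mask pixels are transparent.
 */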
static force_inline __m128i
combine4 (const __m128i *ps, const __m128i *pm)
{
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmMskLo, xmmMskHi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_msk_lo, xmm_msk_hi;
__m128i s;
-
+
if (pm)
{
- xmmMskLo = load128Unaligned (pm);
+ xmm_msk_lo = load_128_unaligned (pm);
- if (isTransparent (xmmMskLo))
+ if (is_transparent (xmm_msk_lo))
return _mm_setzero_si128 ();
}
-
- s = load128Unaligned (ps);
-
+
+ s = load_128_unaligned (ps);
+
if (pm)
{
- unpack_128_2x128 (s, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmMskLo, &xmmMskLo, &xmmMskHi);
-
- expandAlpha_2x128 (xmmMskLo, xmmMskHi, &xmmMskLo, &xmmMskHi);
-
- pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMskLo, &xmmMskHi, &xmmSrcLo, &xmmSrcHi);
-
- s = pack_2x128_128 (xmmSrcLo, xmmSrcHi);
+ unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
+
+ expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_msk_lo, &xmm_msk_hi,
+ &xmm_src_lo, &xmm_src_hi);
+
+ s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
}
return s;
}
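/* Masked OVER: a scalar head until dst is 16-byte aligned, then a 4-pixel
 * SIMD loop with fast paths for an all-zero mask (destination left
 * untouched) and for the case where both source and mask alphas are all
 * 0xff (the source is stored as is), then a scalar tail.
 */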
static force_inline void
-coreCombineOverUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
+core_combine_over_u_sse2_mask (uint32_t * pd,
+ const uint32_t* ps,
+ const uint32_t* pm,
+ int w)
{
uint32_t s, d;
- __m128i xmmDstLo, xmmDstHi;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmAlphaLo, xmmAlphaHi;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
-
/* Align dst on a 16-byte boundary */
- while (w &&
- ((unsigned long)pd & 15))
+ while (w && ((unsigned long)pd & 15))
{
- d = *pd;
- s = combine1 (ps, pm);
+ d = *pd;
+ s = combine1 (ps, pm);
- *pd++ = coreCombineOverUPixelsse2 (s, d);
+ if (s)
+ *pd = core_combine_over_u_pixel_sse2 (s, d);
+ pd++;
ps++;
- if (pm)
- pm++;
- w--;
+ pm++;
+ w--;
}
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
-
- /* I'm loading unaligned because I'm not sure about the address alignment. */
- xmmSrcHi = combine4 ((__m128i*)ps, (__m128i*)pm);
-
- if (isOpaque (xmmSrcHi))
- {
- save128Aligned ((__m128i*)pd, xmmSrcHi);
- }
- else if (!isZero (xmmSrcHi))
- {
- xmmDstHi = load128Aligned ((__m128i*) pd);
+ __m128i mask = load_128_unaligned ((__m128i *)pm);
+
+ if (!is_zero (mask))
+ {
+ __m128i src;
+ __m128i src_hi, src_lo;
+ __m128i mask_hi, mask_lo;
+ __m128i alpha_hi, alpha_lo;
+
+ src = load_128_unaligned ((__m128i *)ps);
+
+ if (is_opaque (_mm_and_si128 (src, mask)))
+ {
+ save_128_aligned ((__m128i *)pd, src);
+ }
+ else
+ {
+ __m128i dst = load_128_aligned ((__m128i *)pd);
+ __m128i dst_hi, dst_lo;
+
+ unpack_128_2x128 (mask, &mask_lo, &mask_hi);
+ unpack_128_2x128 (src, &src_lo, &src_hi);
+
+ expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
+ pix_multiply_2x128 (&src_lo, &src_hi,
+ &mask_lo, &mask_hi,
+ &src_lo, &src_hi);
+
+ unpack_128_2x128 (dst, &dst_lo, &dst_hi);
+
+ expand_alpha_2x128 (src_lo, src_hi,
+ &alpha_lo, &alpha_hi);
+
+ over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
+ &dst_lo, &dst_hi);
+
+ save_128_aligned (
+ (__m128i *)pd,
+ pack_2x128_128 (dst_lo, dst_hi));
+ }
+ }
+
+ pm += 4;
+ ps += 4;
+ pd += 4;
+ w -= 4;
+ }
+ while (w)
+ {
+ d = *pd;
+ s = combine1 (ps, pm);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+ if (s)
+ *pd = core_combine_over_u_pixel_sse2 (s, d);
+ pd++;
+ ps++;
+ pm++;
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
+ w--;
+ }
+}
- over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);
+static force_inline void
+core_combine_over_u_sse2_no_mask (uint32_t * pd,
+ const uint32_t* ps,
+ int w)
+{
+ uint32_t s, d;
- /* rebuid the 4 pixel data and save*/
- save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
- }
+ /* Align dst on a 16-byte boundary */
+ while (w && ((unsigned long)pd & 15))
+ {
+ d = *pd;
+ s = *ps;
- w -= 4;
- ps += 4;
- pd += 4;
- if (pm)
- pm += 4;
+ if (s)
+ *pd = core_combine_over_u_pixel_sse2 (s, d);
+ pd++;
+ ps++;
+ w--;
}
+ while (w >= 4)
+ {
+ __m128i src;
+ __m128i src_hi, src_lo, dst_hi, dst_lo;
+ __m128i alpha_hi, alpha_lo;
+
+ src = load_128_unaligned ((__m128i *)ps);
+
+ if (!is_zero (src))
+ {
+ if (is_opaque (src))
+ {
+ save_128_aligned ((__m128i *)pd, src);
+ }
+ else
+ {
+ __m128i dst = load_128_aligned ((__m128i *)pd);
+
+ unpack_128_2x128 (src, &src_lo, &src_hi);
+ unpack_128_2x128 (dst, &dst_lo, &dst_hi);
+
+ expand_alpha_2x128 (src_lo, src_hi,
+ &alpha_lo, &alpha_hi);
+ over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
+ &dst_lo, &dst_hi);
+
+ save_128_aligned (
+ (__m128i *)pd,
+ pack_2x128_128 (dst_lo, dst_hi));
+ }
+ }
+
+ ps += 4;
+ pd += 4;
+ w -= 4;
+ }
while (w)
{
- d = *pd;
- s = combine1 (ps, pm);
+ d = *pd;
+ s = *ps;
- *pd++ = coreCombineOverUPixelsse2 (s, d);
+ if (s)
+ *pd = core_combine_over_u_pixel_sse2 (s, d);
+ pd++;
ps++;
- if (pm)
- pm++;
- w--;
+
+ w--;
}
}
-static force_inline void
+static void
-coreCombineOverReverseUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
+sse2_combine_over_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
- uint32_t s, d;
+ if (pm)
+ core_combine_over_u_sse2_mask (pd, ps, pm, w);
+ else
+ core_combine_over_u_sse2_no_mask (pd, ps, w);
+}
- __m128i xmmDstLo, xmmDstHi;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmAlphaLo, xmmAlphaHi;
+static void
+sse2_combine_over_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
+{
+ uint32_t s, d;
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
/* Align dst on a 16-byte boundary */
while (w &&
((unsigned long)pd & 15))
{
- d = *pd;
- s = combine1 (ps, pm);
+ d = *pd;
+ s = combine1 (ps, pm);
- *pd++ = coreCombineOverUPixelsse2 (d, s);
- w--;
+ *pd++ = core_combine_over_u_pixel_sse2 (d, s);
+ w--;
ps++;
if (pm)
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
+ /* I'm loading unaligned because I'm not sure
+ * about the address alignment.
+ */
+ xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
- /* I'm loading unaligned because I'm not sure about the address alignment. */
- xmmSrcHi = combine4 ((__m128i*)ps, (__m128i*)pm);
- xmmDstHi = load128Aligned ((__m128i*) pd);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
- expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
+ over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_src_lo, &xmm_src_hi);
- over_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmSrcLo, &xmmSrcHi);
+        /* rebuild the 4 pixel data and save */
+ save_128_aligned ((__m128i*)pd,
+ pack_2x128_128 (xmm_src_lo, xmm_src_hi));
- /* rebuid the 4 pixel data and save*/
- save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmSrcLo, xmmSrcHi));
+ w -= 4;
+ ps += 4;
+ pd += 4;
- w -= 4;
- ps += 4;
- pd += 4;
if (pm)
pm += 4;
}
while (w)
{
- d = *pd;
- s = combine1 (ps, pm);
+ d = *pd;
+ s = combine1 (ps, pm);
- *pd++ = coreCombineOverUPixelsse2 (d, s);
+ *pd++ = core_combine_over_u_pixel_sse2 (d, s);
ps++;
- w--;
+ w--;
if (pm)
pm++;
}
}
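/* Shared helper for IN and IN_REVERSE: returns its second argument
 * multiplied by the alpha of its first, with shortcuts for alpha 0 and
 * 0xff; the two combiners below just swap the argument order.
 */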
static force_inline uint32_t
-coreCombineInUPixelsse2 (uint32_t src, uint32_t dst)
+core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
uint32_t maska = src >> 24;
if (maska == 0)
{
- return 0;
+ return 0;
}
else if (maska != 0xff)
{
- return pack_1x64_32(pixMultiply_1x64 (unpack_32_1x64 (dst), expandAlpha_1x64 (unpack_32_1x64 (src))));
+ return pack_1x128_32 (
+ pix_multiply_1x128 (unpack_32_1x128 (dst),
+ expand_alpha_1x128 (unpack_32_1x128 (src))));
}
return dst;
}
-static force_inline void
-coreCombineInUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
+static void
+sse2_combine_in_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, d;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
while (w && ((unsigned long) pd & 15))
{
- s = combine1 (ps, pm);
- d = *pd;
+ s = combine1 (ps, pm);
+ d = *pd;
- *pd++ = coreCombineInUPixelsse2 (d, s);
- w--;
+ *pd++ = core_combine_in_u_pixel_sse2 (d, s);
+ w--;
ps++;
if (pm)
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+ xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
- xmmDstHi = load128Aligned ((__m128i*) pd);
- xmmSrcHi = combine4 ((__m128i*) ps, (__m128i*) pm);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
- expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);
+ save_128_aligned ((__m128i*)pd,
+ pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
- ps += 4;
- pd += 4;
- w -= 4;
+ ps += 4;
+ pd += 4;
+ w -= 4;
if (pm)
pm += 4;
}
while (w)
{
- s = combine1 (ps, pm);
- d = *pd;
+ s = combine1 (ps, pm);
+ d = *pd;
- *pd++ = coreCombineInUPixelsse2 (d, s);
- w--;
+ *pd++ = core_combine_in_u_pixel_sse2 (d, s);
+ w--;
ps++;
if (pm)
pm++;
}
}
-static force_inline void
-coreCombineReverseInUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
+static void
+sse2_combine_in_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, d;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
while (w && ((unsigned long) pd & 15))
{
- s = combine1 (ps, pm);
- d = *pd;
+ s = combine1 (ps, pm);
+ d = *pd;
- *pd++ = coreCombineInUPixelsse2 (s, d);
+ *pd++ = core_combine_in_u_pixel_sse2 (s, d);
ps++;
- w--;
+ w--;
if (pm)
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
-
- xmmDstHi = load128Aligned ((__m128i*) pd);
- xmmSrcHi = combine4 ((__m128i*) ps, (__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+ xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
- pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_src_lo, &xmm_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
- save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- ps += 4;
- pd += 4;
- w -= 4;
+ ps += 4;
+ pd += 4;
+ w -= 4;
if (pm)
pm += 4;
}
while (w)
{
- s = combine1 (ps, pm);
- d = *pd;
+ s = combine1 (ps, pm);
+ d = *pd;
- *pd++ = coreCombineInUPixelsse2 (s, d);
- w--;
+ *pd++ = core_combine_in_u_pixel_sse2 (s, d);
+ w--;
ps++;
if (pm)
pm++;
}
}
-static force_inline void
-coreCombineReverseOutUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
+static void
+sse2_combine_out_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
- uint32_t s = combine1 (ps, pm);
- uint32_t d = *pd;
+ uint32_t s = combine1 (ps, pm);
+ uint32_t d = *pd;
+
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (d), negate_1x128 (
+ expand_alpha_1x128 (unpack_32_1x128 (s)))));
- *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (s)))));
if (pm)
pm++;
ps++;
- w--;
+ w--;
}
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
-
while (w >= 4)
{
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
-
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
- xmmSrcHi = combine4 ((__m128i*)ps, (__m128i*)pm);
- xmmDstHi = load128Aligned ((__m128i*) pd);
+ xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- negate_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
- pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi);
+ pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_src_lo, &xmm_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
- save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- ps += 4;
- pd += 4;
+ ps += 4;
+ pd += 4;
if (pm)
pm += 4;
- w -= 4;
+
+ w -= 4;
}
while (w)
{
- uint32_t s = combine1 (ps, pm);
- uint32_t d = *pd;
+ uint32_t s = combine1 (ps, pm);
+ uint32_t d = *pd;
- *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (s)))));
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (d), negate_1x128 (
+ expand_alpha_1x128 (unpack_32_1x128 (s)))));
ps++;
if (pm)
pm++;
- w--;
+ w--;
}
}
-static force_inline void
-coreCombineOutUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
+static void
+sse2_combine_out_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
- uint32_t s = combine1 (ps, pm);
- uint32_t d = *pd;
-
- *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
- w--;
+ uint32_t s = combine1 (ps, pm);
+ uint32_t d = *pd;
+
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (s), negate_1x128 (
+ expand_alpha_1x128 (unpack_32_1x128 (d)))));
+ w--;
ps++;
if (pm)
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
-
while (w >= 4)
{
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
+ xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
- xmmSrcHi = combine4 ((__m128i*) ps, (__m128i*)pm);
- xmmDstHi = load128Aligned ((__m128i*) pd);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);
- negate_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
- pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
- ps += 4;
- pd += 4;
- w -= 4;
+ ps += 4;
+ pd += 4;
+ w -= 4;
if (pm)
pm += 4;
}
while (w)
{
- uint32_t s = combine1 (ps, pm);
- uint32_t d = *pd;
-
- *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
- w--;
+ uint32_t s = combine1 (ps, pm);
+ uint32_t d = *pd;
+
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (s), negate_1x128 (
+ expand_alpha_1x128 (unpack_32_1x128 (d)))));
+ w--;
ps++;
if (pm)
pm++;
    }
}
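/* ATOP for one pixel: src * dst.alpha + dst * (255 - src.alpha), with both
 * products divided by 255.
 */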
static force_inline uint32_t
-coreCombineAtopUPixelsse2 (uint32_t src, uint32_t dst)
+core_combine_atop_u_pixel_sse2 (uint32_t src,
+ uint32_t dst)
{
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
+ __m128i s = unpack_32_1x128 (src);
+ __m128i d = unpack_32_1x128 (dst);
- __m64 sa = negate_1x64 (expandAlpha_1x64 (s));
- __m64 da = expandAlpha_1x64 (d);
+ __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
+ __m128i da = expand_alpha_1x128 (d);
- return pack_1x64_32 (pixAddMultiply_1x64 (&s, &da, &d, &sa));
+ return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}
-static force_inline void
-coreCombineAtopUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
+static void
+sse2_combine_atop_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, d;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
- __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
- __m128i xmmAlphaDstLo, xmmAlphaDstHi;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+ __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
while (w && ((unsigned long) pd & 15))
{
- s = combine1 (ps, pm);
- d = *pd;
+ s = combine1 (ps, pm);
+ d = *pd;
- *pd++ = coreCombineAtopUPixelsse2 (s, d);
- w--;
+ *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
+ w--;
ps++;
if (pm)
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
+ xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
- xmmSrcHi = combine4 ((__m128i*)ps, (__m128i*)pm);
- xmmDstHi = load128Aligned ((__m128i*) pd);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
- expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
+ negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
- negate_2x128 (xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
+ pix_add_multiply_2x128 (
+ &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
- pixAddMultiply_2x128 ( &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
- &xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi,
- &xmmDstLo, &xmmDstHi );
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
- ps += 4;
- pd += 4;
- w -= 4;
+ ps += 4;
+ pd += 4;
+ w -= 4;
if (pm)
pm += 4;
}
while (w)
{
- s = combine1 (ps, pm);
- d = *pd;
+ s = combine1 (ps, pm);
+ d = *pd;
- *pd++ = coreCombineAtopUPixelsse2 (s, d);
- w--;
+ *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
+ w--;
ps++;
if (pm)
pm++;
    }
}
static force_inline uint32_t
-coreCombineReverseAtopUPixelsse2 (uint32_t src, uint32_t dst)
+core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
+ uint32_t dst)
{
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
+ __m128i s = unpack_32_1x128 (src);
+ __m128i d = unpack_32_1x128 (dst);
- __m64 sa = expandAlpha_1x64 (s);
- __m64 da = negate_1x64 (expandAlpha_1x64 (d));
+ __m128i sa = expand_alpha_1x128 (s);
+ __m128i da = negate_1x128 (expand_alpha_1x128 (d));
- return pack_1x64_32 (pixAddMultiply_1x64 (&s, &da, &d, &sa));
+ return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}
-static force_inline void
-coreCombineReverseAtopUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
+static void
+sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, d;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
- __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
- __m128i xmmAlphaDstLo, xmmAlphaDstHi;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+ __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
while (w && ((unsigned long) pd & 15))
{
- s = combine1 (ps, pm);
- d = *pd;
+ s = combine1 (ps, pm);
+ d = *pd;
- *pd++ = coreCombineReverseAtopUPixelsse2 (s, d);
+ *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
ps++;
- w--;
+ w--;
if (pm)
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
-
- xmmSrcHi = combine4 ((__m128i*)ps, (__m128i*)pm);
- xmmDstHi = load128Aligned ((__m128i*) pd);
+ xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
- expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
- negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
+ negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
- pixAddMultiply_2x128 ( &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
- &xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi,
- &xmmDstLo, &xmmDstHi );
+ pix_add_multiply_2x128 (
+ &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
- save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- ps += 4;
- pd += 4;
- w -= 4;
+ ps += 4;
+ pd += 4;
+ w -= 4;
if (pm)
pm += 4;
}
while (w)
{
- s = combine1 (ps, pm);
- d = *pd;
+ s = combine1 (ps, pm);
+ d = *pd;
- *pd++ = coreCombineReverseAtopUPixelsse2 (s, d);
+ *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
ps++;
- w--;
+ w--;
if (pm)
pm++;
}
}
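/* XOR for one pixel: src * (255 - dst.alpha) + dst * (255 - src.alpha),
 * with both products divided by 255.
 */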
static force_inline uint32_t
-coreCombineXorUPixelsse2 (uint32_t src, uint32_t dst)
+core_combine_xor_u_pixel_sse2 (uint32_t src,
+ uint32_t dst)
{
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
+ __m128i s = unpack_32_1x128 (src);
+ __m128i d = unpack_32_1x128 (dst);
- __m64 negD = negate_1x64 (expandAlpha_1x64 (d));
- __m64 negS = negate_1x64 (expandAlpha_1x64 (s));
+ __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
+ __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
- return pack_1x64_32 (pixAddMultiply_1x64 (&s, &negD, &d, &negS));
+ return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
}
-static force_inline void
-coreCombineXorUsse2 (uint32_t* dst, const uint32_t* src, const uint32_t *mask, int width)
+static void
+sse2_combine_xor_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
{
int w = width;
uint32_t s, d;
uint32_t* pd = dst;
const uint32_t* ps = src;
const uint32_t* pm = mask;
-
- __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
- __m128i xmmDst, xmmDstLo, xmmDstHi;
- __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
- __m128i xmmAlphaDstLo, xmmAlphaDstHi;
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+ __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
while (w && ((unsigned long) pd & 15))
{
- s = combine1 (ps, pm);
- d = *pd;
+ s = combine1 (ps, pm);
+ d = *pd;
- *pd++ = coreCombineXorUPixelsse2 (s, d);
- w--;
+ *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
+ w--;
ps++;
if (pm)
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
+ xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
+ xmm_dst = load_128_aligned ((__m128i*) pd);
- xmmSrc = combine4 ((__m128i*) ps, (__m128i*) pm);
- xmmDst = load128Aligned ((__m128i*) pd);
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
- expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
+ negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+ negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
- negate_2x128 (xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
- negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
+ pix_add_multiply_2x128 (
+ &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
- pixAddMultiply_2x128 ( &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
- &xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi,
- &xmmDstLo, &xmmDstHi );
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
- ps += 4;
- pd += 4;
- w -= 4;
+ ps += 4;
+ pd += 4;
+ w -= 4;
if (pm)
pm += 4;
}
while (w)
{
- s = combine1 (ps, pm);
- d = *pd;
+ s = combine1 (ps, pm);
+ d = *pd;
- *pd++ = coreCombineXorUPixelsse2 (s, d);
- w--;
+ *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
+ w--;
ps++;
if (pm)
pm++;
    }
}
-static force_inline void
+static void
-coreCombineAddUsse2 (uint32_t* dst, const uint32_t* src, const uint32_t* mask, int width)
+sse2_combine_add_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dst,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width)
{
int w = width;
- uint32_t s,d;
+ uint32_t s, d;
uint32_t* pd = dst;
const uint32_t* ps = src;
const uint32_t* pm = mask;
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
- s = combine1 (ps, pm);
- d = *pd;
+ s = combine1 (ps, pm);
+ d = *pd;
+
ps++;
if (pm)
pm++;
- *pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
- w--;
+ *pd++ = _mm_cvtsi128_si32 (
+ _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
+ w--;
}
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
-
while (w >= 4)
{
__m128i s;
-
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
-
- s = combine4((__m128i*)ps,(__m128i*)pm);
-
- save128Aligned( (__m128i*)pd,
- _mm_adds_epu8( s, load128Aligned ((__m128i*)pd)) );
- pd += 4;
- ps += 4;
+
+ s = combine4 ((__m128i*)ps, (__m128i*)pm);
+
+ save_128_aligned (
+ (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));
+
+ pd += 4;
+ ps += 4;
if (pm)
pm += 4;
- w -= 4;
+ w -= 4;
}
while (w--)
{
- s = combine1 (ps, pm);
- d = *pd;
+ s = combine1 (ps, pm);
+ d = *pd;
+
ps++;
- *pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
+ *pd++ = _mm_cvtsi128_si32 (
+ _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
if (pm)
pm++;
}
}
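/* SATURATE for one pixel: when the source alpha exceeds the room left in
 * the destination (~dst >> 24), every source channel is first scaled by
 * da / sa (DIV_UN8 presumably coming from pixman-combine32.h, included
 * above), then source and destination are added with saturation.
 */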
static force_inline uint32_t
-coreCombineSaturateUPixelsse2 (uint32_t src, uint32_t dst)
+core_combine_saturate_u_pixel_sse2 (uint32_t src,
+ uint32_t dst)
{
- __m64 ms = unpack_32_1x64 (src);
- __m64 md = unpack_32_1x64 (dst);
+ __m128i ms = unpack_32_1x128 (src);
+ __m128i md = unpack_32_1x128 (dst);
uint32_t sa = src >> 24;
uint32_t da = ~dst >> 24;
if (sa > da)
{
- ms = pixMultiply_1x64 (ms, expandAlpha_1x64 (unpack_32_1x64 (IntDiv(da, sa) << 24)));
+ ms = pix_multiply_1x128 (
+ ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
}
- return pack_1x64_32 (_mm_adds_pu16 (md, ms));
+ return pack_1x128_32 (_mm_adds_epu16 (md, ms));
}
-static force_inline void
-coreCombineSaturateUsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
+static void
+sse2_combine_saturate_u (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
- uint32_t s,d;
-
- uint32_t packCmp;
- __m128i xmmSrc, xmmDst;
+ uint32_t s, d;
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ uint32_t pack_cmp;
+ __m128i xmm_src, xmm_dst;
while (w && (unsigned long)pd & 15)
{
- s = combine1 (ps, pm);
- d = *pd;
- *pd++ = coreCombineSaturateUPixelsse2 (s, d);
- w--;
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+ w--;
ps++;
if (pm)
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
-
- xmmDst = load128Aligned ((__m128i*)pd);
- xmmSrc = combine4 ((__m128i*)ps, (__m128i*)pm);
-
- packCmp = _mm_movemask_epi8 (_mm_cmpgt_epi32 (_mm_srli_epi32 (xmmSrc, 24),
- _mm_srli_epi32 (_mm_xor_si128 (xmmDst, Maskff000000), 24)));
-
- /* if some alpha src is grater than respective ~alpha dst */
- if (packCmp)
- {
- s = combine1 (ps++, pm);
- d = *pd;
- *pd++ = coreCombineSaturateUPixelsse2 (s, d);
+ xmm_dst = load_128_aligned ((__m128i*)pd);
+ xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
+
+ pack_cmp = _mm_movemask_epi8 (
+ _mm_cmpgt_epi32 (
+ _mm_srli_epi32 (xmm_src, 24),
+ _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
+
+        /* if some source alpha is greater than the respective ~dst alpha */
+ if (pack_cmp)
+ {
+ s = combine1 (ps++, pm);
+ d = *pd;
+ *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
if (pm)
pm++;
- s = combine1 (ps++, pm);
- d = *pd;
- *pd++ = coreCombineSaturateUPixelsse2 (s, d);
+ s = combine1 (ps++, pm);
+ d = *pd;
+ *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
if (pm)
pm++;
- s = combine1 (ps++, pm);
- d = *pd;
- *pd++ = coreCombineSaturateUPixelsse2 (s, d);
+ s = combine1 (ps++, pm);
+ d = *pd;
+ *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
if (pm)
pm++;
- s = combine1 (ps++, pm);
- d = *pd;
- *pd++ = coreCombineSaturateUPixelsse2 (s, d);
+ s = combine1 (ps++, pm);
+ d = *pd;
+ *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
if (pm)
pm++;
- }
- else
- {
- save128Aligned ((__m128i*)pd, _mm_adds_epu8 (xmmDst, xmmSrc));
+ }
+ else
+ {
+ save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
- pd += 4;
- ps += 4;
+ pd += 4;
+ ps += 4;
if (pm)
pm += 4;
- }
+ }
- w -= 4;
+ w -= 4;
}
while (w--)
{
- s = combine1 (ps, pm);
- d = *pd;
- *pd++ = coreCombineSaturateUPixelsse2 (s, d);
+ s = combine1 (ps, pm);
+ d = *pd;
+
+ *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
ps++;
if (pm)
pm++;
}
}
-static force_inline void
-coreCombineSrcCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
+static void
+sse2_combine_src_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, m;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmMaskLo, xmmMaskHi;
- __m128i xmmDstLo, xmmDstHi;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
while (w && (unsigned long)pd & 15)
{
- s = *ps++;
- m = *pm++;
- *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
- w--;
+ s = *ps++;
+ m = *pm++;
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
+ w--;
}
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
-
- xmmSrcHi = load128Unaligned ((__m128i*)ps);
- xmmMaskHi = load128Unaligned ((__m128i*)pm);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
- pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
- save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
}
while (w)
{
- s = *ps++;
- m = *pm++;
- *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
- w--;
+ s = *ps++;
+ m = *pm++;
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
+ w--;
}
}
static force_inline uint32_t
-coreCombineOverCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
+core_combine_over_ca_pixel_sse2 (uint32_t src,
+ uint32_t mask,
+ uint32_t dst)
{
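+ /* component-alpha OVER: dst = src * mask + dst * (1 - srca * mask) */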
- __m64 s = unpack_32_1x64 (src);
- __m64 expAlpha = expandAlpha_1x64 (s);
- __m64 unpkMask = unpack_32_1x64 (mask);
- __m64 unpkDst = unpack_32_1x64 (dst);
+ __m128i s = unpack_32_1x128 (src);
+ __m128i exp_alpha = expand_alpha_1x128 (s);
+ __m128i unpk_mask = unpack_32_1x128 (mask);
+ __m128i unpk_dst = unpack_32_1x128 (dst);
- return pack_1x64_32 (inOver_1x64 (&s, &expAlpha, &unpkMask, &unpkDst));
+ return pack_1x128_32 (in_over_1x128 (&s, &exp_alpha, &unpk_mask, &unpk_dst));
}
-static force_inline void
-coreCombineOverCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
+static void
+sse2_combine_over_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, m, d;
- __m128i xmmAlphaLo, xmmAlphaHi;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
- __m128i xmmMaskLo, xmmMaskHi;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
while (w && (unsigned long)pd & 15)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
- *pd++ = coreCombineOverCPixelsse2 (s, m, d);
- w--;
+ *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
+ w--;
}
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
- xmmDstHi = load128Aligned ((__m128i*)pd);
- xmmSrcHi = load128Unaligned ((__m128i*)ps);
- xmmMaskHi = load128Unaligned ((__m128i*)pm);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
- inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
}
while (w)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
- *pd++ = coreCombineOverCPixelsse2 (s, m, d);
- w--;
+ *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
+ w--;
}
}
static force_inline uint32_t
-coreCombineOverReverseCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
+core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
+ uint32_t mask,
+ uint32_t dst)
{
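+ /* component-alpha OVER REVERSE: dst = dst + src * mask * (1 - dsta) */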
- __m64 d = unpack_32_1x64 (dst);
+ __m128i d = unpack_32_1x128 (dst);
- return pack_1x64_32(over_1x64 (d, expandAlpha_1x64 (d), pixMultiply_1x64 (unpack_32_1x64 (src), unpack_32_1x64 (mask))));
+ return pack_1x128_32 (
+ over_1x128 (d, expand_alpha_1x128 (d),
+ pix_multiply_1x128 (unpack_32_1x128 (src),
+ unpack_32_1x128 (mask))));
}
-static force_inline void
-coreCombineOverReverseCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
+static void
+sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, m, d;
- __m128i xmmAlphaLo, xmmAlphaHi;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
- __m128i xmmMaskLo, xmmMaskHi;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
while (w && (unsigned long)pd & 15)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
- *pd++ = coreCombineOverReverseCPixelsse2 (s, m, d);
- w--;
+ *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
+ w--;
}
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
-
- xmmDstHi = load128Aligned ((__m128i*)pd);
- xmmSrcHi = load128Unaligned ((__m128i*)ps);
- xmmMaskHi = load128Unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
-
- expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
- pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
-
- over_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi);
-
- save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmMaskLo, xmmMaskHi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
}
while (w)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
- *pd++ = coreCombineOverReverseCPixelsse2 (s, m, d);
- w--;
+ *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
+ w--;
}
}
-static force_inline void
-coreCombineInCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
+static void
+sse2_combine_in_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, m, d;
- __m128i xmmAlphaLo, xmmAlphaHi;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
- __m128i xmmMaskLo, xmmMaskHi;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
while (w && (unsigned long)pd & 15)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
- *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
- expandAlpha_1x64 (unpack_32_1x64 (d))));
- w--;
- }
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
+ expand_alpha_1x128 (unpack_32_1x128 (d))));
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ w--;
+ }
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
- xmmDstHi = load128Aligned ((__m128i*)pd);
- xmmSrcHi = load128Unaligned ((__m128i*)ps);
- xmmMaskHi = load128Unaligned ((__m128i*)pm);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
- expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
- pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
- pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);
+ pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
- save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
}
while (w)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
- *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
- expandAlpha_1x64 (unpack_32_1x64 (d))));
- w--;
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (s), unpack_32_1x128 (m)),
+ expand_alpha_1x128 (unpack_32_1x128 (d))));
+
+ w--;
}
}
-static force_inline void
-coreCombineInReverseCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
+static void
+sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, m, d;
- __m128i xmmAlphaLo, xmmAlphaHi;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
- __m128i xmmMaskLo, xmmMaskHi;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
while (w && (unsigned long)pd & 15)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
- pixMultiply_1x64 (unpack_32_1x64 (m),
- expandAlpha_1x64 (unpack_32_1x64 (s)))));
- w--;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (d),
+ pix_multiply_1x128 (unpack_32_1x128 (m),
+ expand_alpha_1x128 (unpack_32_1x128 (s)))));
+ w--;
}
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
-
- xmmDstHi = load128Aligned ((__m128i*)pd);
- xmmSrcHi = load128Unaligned ((__m128i*)ps);
- xmmMaskHi = load128Unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
-
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
- pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaLo, &xmmAlphaHi, &xmmAlphaLo, &xmmAlphaHi);
-
- pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);
-
- save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+ pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
}
while (w)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
- pixMultiply_1x64 (unpack_32_1x64 (m),
- expandAlpha_1x64 (unpack_32_1x64 (s)))));
- w--;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (d),
+ pix_multiply_1x128 (unpack_32_1x128 (m),
+ expand_alpha_1x128 (unpack_32_1x128 (s)))));
+ w--;
}
}
-static force_inline void
-coreCombineOutCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
+static void
+sse2_combine_out_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, m, d;
- __m128i xmmAlphaLo, xmmAlphaHi;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
- __m128i xmmMaskLo, xmmMaskHi;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
while (w && (unsigned long)pd & 15)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
- negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
- w--;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (s), unpack_32_1x128 (m)),
+ negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
+ w--;
}
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
-
- xmmDstHi = load128Aligned ((__m128i*)pd);
- xmmSrcHi = load128Unaligned ((__m128i*)ps);
- xmmMaskHi = load128Unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
-
- expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
- negate_2x128 (xmmAlphaLo, xmmAlphaHi, &xmmAlphaLo, &xmmAlphaHi);
-
- pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
- pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);
-
- save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+ negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+ pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
}
while (w)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
- *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
- negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
- w--;
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (s), unpack_32_1x128 (m)),
+ negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
+
+ w--;
}
}
-static force_inline void
-coreCombineOutReverseCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
+static void
+sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, m, d;
- __m128i xmmAlphaLo, xmmAlphaHi;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
- __m128i xmmMaskLo, xmmMaskHi;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
while (w && (unsigned long)pd & 15)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
- negate_1x64 (pixMultiply_1x64 (unpack_32_1x64 (m),
- expandAlpha_1x64 (unpack_32_1x64 (s))))));
- w--;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (d),
+ negate_1x128 (pix_multiply_1x128 (
+ unpack_32_1x128 (m),
+ expand_alpha_1x128 (unpack_32_1x128 (s))))));
+ w--;
}
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
- xmmDstHi = load128Aligned ((__m128i*)pd);
- xmmSrcHi = load128Unaligned ((__m128i*)ps);
- xmmMaskHi = load128Unaligned ((__m128i*)pm);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
+ pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
- pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi);
+ negate_2x128 (xmm_mask_lo, xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
- negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
+ pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
- pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
- save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
}
while (w)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
- negate_1x64 (pixMultiply_1x64 (unpack_32_1x64 (m),
- expandAlpha_1x64 (unpack_32_1x64 (s))))));
- w--;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (d),
+ negate_1x128 (pix_multiply_1x128 (
+ unpack_32_1x128 (m),
+ expand_alpha_1x128 (unpack_32_1x128 (s))))));
+ w--;
}
}
static force_inline uint32_t
-coreCombineAtopCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
+core_combine_atop_ca_pixel_sse2 (uint32_t src,
+ uint32_t mask,
+ uint32_t dst)
{
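+ /* component-alpha ATOP: dst = src * mask * dsta + dst * (1 - srca * mask) */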
- __m64 m = unpack_32_1x64 (mask);
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
- __m64 sa = expandAlpha_1x64 (s);
- __m64 da = expandAlpha_1x64 (d);
+ __m128i m = unpack_32_1x128 (mask);
+ __m128i s = unpack_32_1x128 (src);
+ __m128i d = unpack_32_1x128 (dst);
+ __m128i sa = expand_alpha_1x128 (s);
+ __m128i da = expand_alpha_1x128 (d);
- s = pixMultiply_1x64 (s, m);
- m = negate_1x64 (pixMultiply_1x64 (m, sa));
+ s = pix_multiply_1x128 (s, m);
+ m = negate_1x128 (pix_multiply_1x128 (m, sa));
- return pack_1x64_32 (pixAddMultiply_1x64 (&d, &m, &s, &da));
+ return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
}
-static force_inline void
-coreCombineAtopCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
+static void
+sse2_combine_atop_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, m, d;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
- __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
- __m128i xmmAlphaDstLo, xmmAlphaDstHi;
- __m128i xmmMaskLo, xmmMaskHi;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+ __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
while (w && (unsigned long)pd & 15)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
- *pd++ = coreCombineAtopCPixelsse2 (s, m, d);
- w--;
+ *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
+ w--;
}
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
-
- xmmDstHi = load128Aligned ((__m128i*)pd);
- xmmSrcHi = load128Unaligned ((__m128i*)ps);
- xmmMaskHi = load128Unaligned ((__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_src_lo, &xmm_src_hi);
+ pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ pix_add_multiply_2x128 (
+ &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
+ while (w)
+ {
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
- expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
-
- pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
- pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi);
-
- negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
-
- pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi,
- &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
- &xmmDstLo, &xmmDstHi);
-
- save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
- }
-
- while (w)
- {
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = coreCombineAtopCPixelsse2 (s, m, d);
- w--;
+ *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
+ w--;
}
}
static force_inline uint32_t
-coreCombineReverseAtopCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
+core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
+ uint32_t mask,
+ uint32_t dst)
{
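+ /* component-alpha ATOP REVERSE: dst = src * mask * (1 - dsta) + dst * srca * mask */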
- __m64 m = unpack_32_1x64 (mask);
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
+ __m128i m = unpack_32_1x128 (mask);
+ __m128i s = unpack_32_1x128 (src);
+ __m128i d = unpack_32_1x128 (dst);
- __m64 da = negate_1x64 (expandAlpha_1x64 (d));
- __m64 sa = expandAlpha_1x64 (s);
+ __m128i da = negate_1x128 (expand_alpha_1x128 (d));
+ __m128i sa = expand_alpha_1x128 (s);
- s = pixMultiply_1x64 (s, m);
- m = pixMultiply_1x64 (m, sa);
+ s = pix_multiply_1x128 (s, m);
+ m = pix_multiply_1x128 (m, sa);
- return pack_1x64_32 (pixAddMultiply_1x64 (&d, &m, &s, &da));
+ return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
}
-static force_inline void
-coreCombineReverseAtopCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
+static void
+sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, m, d;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
- __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
- __m128i xmmAlphaDstLo, xmmAlphaDstHi;
- __m128i xmmMaskLo, xmmMaskHi;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+ __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
while (w && (unsigned long)pd & 15)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
- *pd++ = coreCombineReverseAtopCPixelsse2 (s, m, d);
- w--;
+ *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
+ w--;
}
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
-
- xmmDstHi = load128Aligned ((__m128i*)pd);
- xmmSrcHi = load128Unaligned ((__m128i*)ps);
- xmmMaskHi = load128Unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
-
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
- expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
-
- pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
- pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi);
-
- negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
-
- pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi,
- &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
- &xmmDstLo, &xmmDstHi);
-
- save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_src_lo, &xmm_src_hi);
+ pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ pix_add_multiply_2x128 (
+ &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
}
while (w)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
- *pd++ = coreCombineReverseAtopCPixelsse2 (s, m, d);
- w--;
+ *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
+ w--;
}
}
static force_inline uint32_t
-coreCombineXorCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
+core_combine_xor_ca_pixel_sse2 (uint32_t src,
+ uint32_t mask,
+ uint32_t dst)
{
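+ /* component-alpha XOR: dst = src * mask * (1 - dsta) + dst * (1 - srca * mask) */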
- __m64 a = unpack_32_1x64 (mask);
- __m64 s = unpack_32_1x64 (src);
- __m64 d = unpack_32_1x64 (dst);
-
- __m64 alphaDst = negate_1x64 (pixMultiply_1x64 (a, expandAlpha_1x64 (s)));
- __m64 dest = pixMultiply_1x64 (s, a);
- __m64 alphaSrc = negate_1x64 (expandAlpha_1x64 (d));
-
- return pack_1x64_32 (pixAddMultiply_1x64 (&d,
- &alphaDst,
- &dest,
- &alphaSrc));
+ __m128i a = unpack_32_1x128 (mask);
+ __m128i s = unpack_32_1x128 (src);
+ __m128i d = unpack_32_1x128 (dst);
+
+ __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
+ a, expand_alpha_1x128 (s)));
+ __m128i dest = pix_multiply_1x128 (s, a);
+ __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
+
+ return pack_1x128_32 (pix_add_multiply_1x128 (&d,
+ &alpha_dst,
+ &dest,
+ &alpha_src));
}
-static force_inline void
-coreCombineXorCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
+static void
+sse2_combine_xor_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, m, d;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
- __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
- __m128i xmmAlphaDstLo, xmmAlphaDstHi;
- __m128i xmmMaskLo, xmmMaskHi;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+ __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
while (w && (unsigned long)pd & 15)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
- *pd++ = coreCombineXorCPixelsse2 (s, m, d);
- w--;
+ *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
+ w--;
}
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
-
- xmmDstHi = load128Aligned ((__m128i*)pd);
- xmmSrcHi = load128Unaligned ((__m128i*)ps);
- xmmMaskHi = load128Unaligned ((__m128i*)pm);
-
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
-
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
- expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
-
- pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
- pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi);
-
- negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
- negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
-
- pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi,
- &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
- &xmmDstLo, &xmmDstHi);
-
- save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_src_lo, &xmm_src_hi);
+ pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+ &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+ &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+ negate_2x128 (xmm_mask_lo, xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ pix_add_multiply_2x128 (
+ &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
}
while (w)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
- *pd++ = coreCombineXorCPixelsse2 (s, m, d);
- w--;
+ *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
+ w--;
}
}
-static force_inline void
-coreCombineAddCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
+static void
+sse2_combine_add_ca (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * pd,
+ const uint32_t * ps,
+ const uint32_t * pm,
+ int w)
{
uint32_t s, m, d;
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
- __m128i xmmMaskLo, xmmMaskHi;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask_lo, xmm_mask_hi;
while (w && (unsigned long)pd & 15)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (_mm_adds_pu8 (pixMultiply_1x64 (unpack_32_1x64 (s),
- unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
- w--;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x128_32 (
+ _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
+ unpack_32_1x128 (m)),
+ unpack_32_1x128 (d)));
+ w--;
}
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)ps);
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)ps);
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
-
- xmmSrcHi = load128Unaligned ((__m128i*)ps);
- xmmMaskHi = load128Unaligned ((__m128i*)pm);
- xmmDstHi = load128Aligned ((__m128i*)pd);
-
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
-
- pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
-
- save128Aligned( (__m128i*)pd, pack_2x128_128 (_mm_adds_epu8 (xmmSrcLo, xmmDstLo),
- _mm_adds_epu8 (xmmSrcHi, xmmDstHi)));
-
- ps += 4;
- pd += 4;
- pm += 4;
- w -= 4;
+ xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+ xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+ xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_src_lo, &xmm_src_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (
+ _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
+ _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
+
+ ps += 4;
+ pd += 4;
+ pm += 4;
+ w -= 4;
}
while (w)
{
- s = *ps++;
- m = *pm++;
- d = *pd;
-
- *pd++ = pack_1x64_32 (_mm_adds_pu8 (pixMultiply_1x64 (unpack_32_1x64 (s),
- unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
- w--;
+ s = *ps++;
+ m = *pm++;
+ d = *pd;
+
+ *pd++ = pack_1x128_32 (
+ _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
+ unpack_32_1x128 (m)),
+ unpack_32_1x128 (d)));
+ w--;
}
}
-/* -------------------------------------------------------------------------------------------------
- * fbComposeSetupSSE2
- */
-static force_inline __m64
-createMask_16_64 (uint16_t mask)
-{
- return _mm_set1_pi16 (mask);
-}
-
static force_inline __m128i
-createMask_16_128 (uint16_t mask)
+create_mask_16_128 (uint16_t mask)
{
return _mm_set1_epi16 (mask);
}
-static force_inline __m64
-createMask_2x32_64 (uint32_t mask0, uint32_t mask1)
-{
- return _mm_set_pi32 (mask0, mask1);
-}
-
+/* Work around a code generation bug in Sun Studio 12. */
+#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
+# define create_mask_2x32_128(mask0, mask1) \
+ (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
+#else
static force_inline __m128i
-createMask_2x32_128 (uint32_t mask0, uint32_t mask1)
+create_mask_2x32_128 (uint32_t mask0,
+ uint32_t mask1)
{
return _mm_set_epi32 (mask0, mask1, mask0, mask1);
}
-
-/* SSE2 code patch for fbcompose.c */
-
-static void
-sse2CombineOverU (pixman_implementation_t *imp, pixman_op_t op,
- uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
-{
- coreCombineOverUsse2 (dst, src, mask, width);
- _mm_empty();
-}
+#endif
static void
-sse2CombineOverReverseU (pixman_implementation_t *imp, pixman_op_t op,
- uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
+sse2_composite_over_n_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
{
- coreCombineOverReverseUsse2 (dst, src, mask, width);
- _mm_empty();
-}
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t src;
+ uint32_t *dst_line, *dst, d;
+ int32_t w;
+ int dst_stride;
+ __m128i xmm_src, xmm_alpha;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
-static void
-sse2CombineInU (pixman_implementation_t *imp, pixman_op_t op,
- uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
-{
- coreCombineInUsse2 (dst, src, mask, width);
- _mm_empty();
-}
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
-static void
-sse2CombineInReverseU (pixman_implementation_t *imp, pixman_op_t op,
- uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
-{
- coreCombineReverseInUsse2 (dst, src, mask, width);
- _mm_empty();
-}
+ if (src == 0)
+ return;
-static void
-sse2CombineOutU (pixman_implementation_t *imp, pixman_op_t op,
- uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
-{
- coreCombineOutUsse2 (dst, src, mask, width);
- _mm_empty();
-}
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-static void
-sse2CombineOutReverseU (pixman_implementation_t *imp, pixman_op_t op,
- uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
-{
- coreCombineReverseOutUsse2 (dst, src, mask, width);
- _mm_empty();
-}
+ xmm_src = expand_pixel_32_1x128 (src);
+ xmm_alpha = expand_alpha_1x128 (xmm_src);
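+
+ /* solid source OVER a8r8g8b8: the expanded source and its alpha are
+  * reused for every pixel; only the destination is reloaded. */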
-static void
-sse2CombineAtopU (pixman_implementation_t *imp, pixman_op_t op,
- uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
-{
- coreCombineAtopUsse2 (dst, src, mask, width);
- _mm_empty();
-}
+ while (height--)
+ {
+ dst = dst_line;
+
+ dst_line += dst_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ d = *dst;
+ *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
+ xmm_alpha,
+ unpack_32_1x128 (d)));
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ /* rebuild the 4 pixel data and save */
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ w -= 4;
+ dst += 4;
+ }
+
+ while (w)
+ {
+ d = *dst;
+ *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
+ xmm_alpha,
+ unpack_32_1x128 (d)));
+ w--;
+ }
-static void
-sse2CombineAtopReverseU (pixman_implementation_t *imp, pixman_op_t op,
- uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
-{
- coreCombineReverseAtopUsse2 (dst, src, mask, width);
- _mm_empty();
+ }
}
static void
-sse2CombineXorU (pixman_implementation_t *imp, pixman_op_t op,
- uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
+sse2_composite_over_n_0565 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
{
- coreCombineXorUsse2 (dst, src, mask, width);
- _mm_empty();
-}
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t src;
+ uint16_t *dst_line, *dst, d;
+ int32_t w;
+ int dst_stride;
+ __m128i xmm_src, xmm_alpha;
+ __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
-static void
-sse2CombineAddU (pixman_implementation_t *imp, pixman_op_t op,
- uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
-{
- coreCombineAddUsse2 (dst, src, mask, width);
- _mm_empty();
-}
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
-static void
-sse2CombineSaturateU (pixman_implementation_t *imp, pixman_op_t op,
- uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
-{
- coreCombineSaturateUsse2 (dst, src, mask, width);
- _mm_empty();
-}
+ if (src == 0)
+ return;
-static void
-sse2CombineSrcC (pixman_implementation_t *imp, pixman_op_t op,
- uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
-{
- coreCombineSrcCsse2 (dst, src, mask, width);
- _mm_empty();
-}
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-static void
-sse2CombineOverC (pixman_implementation_t *imp, pixman_op_t op,
- uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
-{
- coreCombineOverCsse2 (dst, src, mask, width);
- _mm_empty();
-}
+ xmm_src = expand_pixel_32_1x128 (src);
+ xmm_alpha = expand_alpha_1x128 (xmm_src);
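+
+ /* solid source OVER r5g6b5: each destination pixel is expanded to 8888,
+  * blended, then packed back to 565. */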
-static void
-sse2CombineOverReverseC (pixman_implementation_t *imp, pixman_op_t op,
- uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
-{
- coreCombineOverReverseCsse2 (dst, src, mask, width);
- _mm_empty();
-}
+ while (height--)
+ {
+ dst = dst_line;
+
+ dst_line += dst_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ d = *dst;
+
+ *dst++ = pack_565_32_16 (
+ pack_1x128_32 (over_1x128 (xmm_src,
+ xmm_alpha,
+ expand565_16_1x128 (d))));
+ w--;
+ }
+
+ while (w >= 8)
+ {
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_565_128_4x128 (xmm_dst,
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+
+ over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_dst0, &xmm_dst1);
+ over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_dst2, &xmm_dst3);
+
+ xmm_dst = pack_565_4x128_128 (
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+
+ save_128_aligned ((__m128i*)dst, xmm_dst);
+
+ dst += 8;
+ w -= 8;
+ }
+
+ while (w--)
+ {
+ d = *dst;
+ *dst++ = pack_565_32_16 (
+ pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
+ expand565_16_1x128 (d))));
+ }
+ }
-static void
-sse2CombineInC (pixman_implementation_t *imp, pixman_op_t op,
- uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
-{
- coreCombineInCsse2 (dst, src, mask, width);
- _mm_empty();
}
static void
-sse2CombineInReverseC (pixman_implementation_t *imp, pixman_op_t op,
- uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
+sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
{
- coreCombineInReverseCsse2 (dst, src, mask, width);
- _mm_empty();
-}
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t src;
+ uint32_t *dst_line, d;
+ uint32_t *mask_line, m;
+ uint32_t pack_cmp;
+ int dst_stride, mask_stride;
-static void
-sse2CombineOutC (pixman_implementation_t *imp, pixman_op_t op,
- uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
-{
- coreCombineOutCsse2 (dst, src, mask, width);
- _mm_empty();
-}
+ __m128i xmm_src;
+ __m128i xmm_dst;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
-static void
-sse2CombineOutReverseC (pixman_implementation_t *imp, pixman_op_t op,
- uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
-{
- coreCombineOutReverseCsse2 (dst, src, mask, width);
- _mm_empty();
-}
+ __m128i mmx_src, mmx_mask, mmx_dest;
-static void
-sse2CombineAtopC (pixman_implementation_t *imp, pixman_op_t op,
- uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
-{
- coreCombineAtopCsse2 (dst, src, mask, width);
- _mm_empty();
-}
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
-static void
-sse2CombineAtopReverseC (pixman_implementation_t *imp, pixman_op_t op,
- uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
-{
- coreCombineReverseAtopCsse2 (dst, src, mask, width);
- _mm_empty();
-}
+ if (src == 0)
+ return;
-static void
-sse2CombineXorC (pixman_implementation_t *imp, pixman_op_t op,
- uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
-{
- coreCombineXorCsse2 (dst, src, mask, width);
- _mm_empty();
-}
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
-static void
-sse2CombineAddC (pixman_implementation_t *imp, pixman_op_t op,
- uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
-{
- coreCombineAddCsse2 (dst, src, mask, width);
- _mm_empty();
-}
+ xmm_src = _mm_unpacklo_epi8 (
+ create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
+ mmx_src = xmm_src;
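+
+ /* component-alpha ADD with a solid source: dst = dst + src * mask,
+  * skipping any 4-pixel block whose mask is entirely zero. */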
-/* -------------------------------------------------------------------------------------------------
- * fast_CompositeOver_n_8888
- */
+ while (height--)
+ {
+ int w = width;
+ const uint32_t *pm = (uint32_t *)mask_line;
+ uint32_t *pd = (uint32_t *)dst_line;
-static void
-sse2_CompositeOver_n_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int32_t xSrc,
- int32_t ySrc,
- int32_t xMask,
- int32_t yMask,
- int32_t xDst,
- int32_t yDst,
- int32_t width,
- int32_t height)
-{
- uint32_t src;
- uint32_t *dstLine, *dst, d;
- uint16_t w;
- int dstStride;
- __m128i xmmSrc, xmmAlpha;
- __m128i xmmDst, xmmDstLo, xmmDstHi;
+ dst_line += dst_stride;
+ mask_line += mask_stride;
- src = _pixman_image_get_solid(pSrc, pDst->bits.format);
+ while (w && (unsigned long)pd & 15)
+ {
+ m = *pm++;
- if (src == 0)
- return;
+ if (m)
+ {
+ d = *pd;
- fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
+ mmx_mask = unpack_32_1x128 (m);
+ mmx_dest = unpack_32_1x128 (d);
- xmmSrc = expandPixel_32_1x128 (src);
- xmmAlpha = expandAlpha_1x128 (xmmSrc);
+ *pd = pack_1x128_32 (
+ _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
+ mmx_dest));
+ }
- while (height--)
- {
- dst = dstLine;
+ pd++;
+ w--;
+ }
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)dst);
+ while (w >= 4)
+ {
+ xmm_mask = load_128_unaligned ((__m128i*)pm);
- dstLine += dstStride;
- w = width;
+ pack_cmp =
+ _mm_movemask_epi8 (
+ _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
- while (w && (unsigned long)dst & 15)
- {
- d = *dst;
- *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
- _mm_movepi64_pi64 (xmmAlpha),
- unpack_32_1x64 (d)));
- w--;
- }
+ /* if all bits in the mask are zero, pack_cmp is equal to 0xffff */
+ if (pack_cmp != 0xffff)
+ {
+ xmm_dst = load_128_aligned ((__m128i*)pd);
- cachePrefetch ((__m128i*)dst);
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
- while (w >= 4)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)dst);
+ pix_multiply_2x128 (&xmm_src, &xmm_src,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+ xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
- xmmDst = load128Aligned ((__m128i*)dst);
+ save_128_aligned (
+ (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
+ }
- unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
- over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDstLo, &xmmDstHi);
+ while (w)
+ {
+ m = *pm++;
- /* rebuid the 4 pixel data and save*/
- save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
+ if (m)
+ {
+ d = *pd;
- w -= 4;
- dst += 4;
- }
+ mmx_mask = unpack_32_1x128 (m);
+ mmx_dest = unpack_32_1x128 (d);
- while (w)
- {
- d = *dst;
- *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
- _mm_movepi64_pi64 (xmmAlpha),
- unpack_32_1x64 (d)));
- w--;
- }
+ *pd = pack_1x128_32 (
+ _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
+ mmx_dest));
+ }
+ pd++;
+ w--;
+ }
}
- _mm_empty();
+
}
-/* -------------------------------------------------------------------------------------------------
- * fast_CompositeOver_n_0565
- */
static void
-sse2_CompositeOver_n_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int32_t xSrc,
- int32_t ySrc,
- int32_t xMask,
- int32_t yMask,
- int32_t xDst,
- int32_t yDst,
- int32_t width,
- int32_t height)
+sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
{
- uint32_t src;
- uint16_t *dstLine, *dst, d;
- uint16_t w;
- int dstStride;
- __m128i xmmSrc, xmmAlpha;
- __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t src;
+ uint32_t *dst_line, d;
+ uint32_t *mask_line, m;
+ uint32_t pack_cmp;
+ int dst_stride, mask_stride;
+
+ __m128i xmm_src, xmm_alpha;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
- src = _pixman_image_get_solid(pSrc, pDst->bits.format);
+ __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
if (src == 0)
- return;
+ return;
- fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
- xmmSrc = expandPixel_32_1x128 (src);
- xmmAlpha = expandAlpha_1x128 (xmmSrc);
+ xmm_src = _mm_unpacklo_epi8 (
+ create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
+ xmm_alpha = expand_alpha_1x128 (xmm_src);
+ mmx_src = xmm_src;
+ mmx_alpha = xmm_alpha;
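+
+ /* component-alpha OVER with a solid source, skipping any 4-pixel block
+  * whose mask is entirely zero. */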
while (height--)
{
- dst = dstLine;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)dst);
-
- dstLine += dstStride;
- w = width;
-
- while (w && (unsigned long)dst & 15)
- {
- d = *dst;
+ int w = width;
+ const uint32_t *pm = (uint32_t *)mask_line;
+ uint32_t *pd = (uint32_t *)dst_line;
+
+ dst_line += dst_stride;
+ mask_line += mask_stride;
+
+ while (w && (unsigned long)pd & 15)
+ {
+ m = *pm++;
+
+ if (m)
+ {
+ d = *pd;
+ mmx_mask = unpack_32_1x128 (m);
+ mmx_dest = unpack_32_1x128 (d);
+
+ *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
+ &mmx_alpha,
+ &mmx_mask,
+ &mmx_dest));
+ }
+
+ pd++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_mask = load_128_unaligned ((__m128i*)pm);
+
+ pack_cmp =
+ _mm_movemask_epi8 (
+ _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+ /* if all bits in the mask are zero, pack_cmp is equal to 0xffff */
+ if (pack_cmp != 0xffff)
+ {
+ xmm_dst = load_128_aligned ((__m128i*)pd);
+
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ in_over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ pd += 4;
+ pm += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ m = *pm++;
+
+ if (m)
+ {
+ d = *pd;
+ mmx_mask = unpack_32_1x128 (m);
+ mmx_dest = unpack_32_1x128 (d);
+
+ *pd = pack_1x128_32 (
+ in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
+ }
+
+ pd++;
+ w--;
+ }
+ }
- *dst++ = pack565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
- _mm_movepi64_pi64 (xmmAlpha),
- expand565_16_1x64 (d))));
- w--;
- }
+}
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)dst);
+static void
+sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+ uint32_t mask;
+ int32_t w;
+ int dst_stride, src_stride;
- while (w >= 8)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)dst);
+ __m128i xmm_mask;
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
- xmmDst = load128Aligned ((__m128i*)dst);
-
- unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
-
- over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDst0, &xmmDst1);
- over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDst2, &xmmDst3);
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
- xmmDst = pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
- save128Aligned ((__m128i*)dst, xmmDst);
+ mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
- dst += 8;
- w -= 8;
- }
+ xmm_mask = create_mask_16_128 (mask >> 24);
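+
+ /* a8r8g8b8 source OVER destination through a solid mask: only the mask's
+  * alpha byte is used, expanded once into xmm_mask. */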
- while (w--)
- {
- d = *dst;
- *dst++ = pack565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
- _mm_movepi64_pi64 (xmmAlpha),
- expand565_16_1x64 (d))));
- }
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ uint32_t s = *src++;
+
+ if (s)
+ {
+ uint32_t d = *dst;
+
+ __m128i ms = unpack_32_1x128 (s);
+ __m128i alpha = expand_alpha_1x128 (ms);
+ __m128i dest = xmm_mask;
+ __m128i alpha_dst = unpack_32_1x128 (d);
+
+ *dst = pack_1x128_32 (
+ in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
+ }
+ dst++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_src = load_128_unaligned ((__m128i*)src);
+
+ if (!is_zero (xmm_src))
+ {
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_mask, &xmm_mask,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ dst += 4;
+ src += 4;
+ w -= 4;
+ }
+
+ while (w)
+ {
+ uint32_t s = *src++;
+
+ if (s)
+ {
+ uint32_t d = *dst;
+
+ __m128i ms = unpack_32_1x128 (s);
+ __m128i alpha = expand_alpha_1x128 (ms);
+ __m128i mask = xmm_mask;
+ __m128i dest = unpack_32_1x128 (d);
+
+ *dst = pack_1x128_32 (
+ in_over_1x128 (&ms, &alpha, &mask, &dest));
+ }
+
+ dst++;
+ w--;
+ }
}
- _mm_empty();
}
-/* -------------------------------------------------------------------------------------------------
- * fast_CompositeOver_n_8888_8888_ca
- */
-
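+/* SRC copy from x8r8g8b8 to a8r8g8b8: the undefined alpha byte is
+ * forced to 0xff.
+ */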
static void
-sse2_CompositeOver_n_8888_8888_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int32_t xSrc,
- int32_t ySrc,
- int32_t xMask,
- int32_t yMask,
- int32_t xDst,
- int32_t yDst,
- int32_t width,
- int32_t height)
+sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
{
- uint32_t src;
- uint32_t *dstLine, d;
- uint32_t *maskLine, m;
- uint32_t packCmp;
- int dstStride, maskStride;
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+ int32_t w;
+ int dst_stride, src_stride;
- __m128i xmmSrc, xmmAlpha;
- __m128i xmmDst, xmmDstLo, xmmDstHi;
- __m128i xmmMask, xmmMaskLo, xmmMaskHi;
- __m64 mmxSrc, mmxAlpha, mmxMask, mmxDst;
-
- src = _pixman_image_get_solid(pSrc, pDst->bits.format);
-
- if (src == 0)
- return;
-
- fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
- fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1);
-
- xmmSrc = _mm_unpacklo_epi8 (createMask_2x32_128 (src, src), _mm_setzero_si128 ());
- xmmAlpha = expandAlpha_1x128 (xmmSrc);
- mmxSrc = _mm_movepi64_pi64 (xmmSrc);
- mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
while (height--)
{
- int w = width;
- const uint32_t *pm = (uint32_t *)maskLine;
- uint32_t *pd = (uint32_t *)dstLine;
-
- dstLine += dstStride;
- maskLine += maskStride;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
-
- while (w && (unsigned long)pd & 15)
- {
- m = *pm++;
-
- if (m)
- {
- d = *pd;
- mmxMask = unpack_32_1x64 (m);
- mmxDst = unpack_32_1x64 (d);
-
- *pd = pack_1x64_32 (inOver_1x64 (&mmxSrc,
- &mmxAlpha,
- &mmxMask,
- &mmxDst));
- }
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ *dst++ = *src++ | 0xff000000;
+ w--;
+ }
+
+ while (w >= 16)
+ {
+ __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
+
+ xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
+ xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
+ xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
+ xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
+
+ save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
+ save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
+ save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
+ save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
+
+ dst += 16;
+ src += 16;
+ w -= 16;
+ }
+
+ while (w)
+ {
+ *dst++ = *src++ | 0xff000000;
+ w--;
+ }
+ }
- pd++;
- w--;
- }
+}
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)pd);
- cachePrefetch ((__m128i*)pm);
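+/* OVER of an x8r8g8b8 source (treated as fully opaque) onto an
+ * a8r8g8b8 destination, scaled by the alpha channel of a solid mask.
+ */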
+static void
+sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+ uint32_t mask;
+ int dst_stride, src_stride;
+ int32_t w;
- while (w >= 4)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)pd);
- cachePrefetchNext ((__m128i*)pm);
+ __m128i xmm_mask, xmm_alpha;
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- xmmMask = load128Unaligned ((__m128i*)pm);
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
- packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128()));
+ mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
- /* if all bits in mask are zero, packCmp are equal to 0xffff */
- if (packCmp != 0xffff)
- {
- xmmDst = load128Aligned ((__m128i*)pd);
+ xmm_mask = create_mask_16_128 (mask >> 24);
+ xmm_alpha = mask_00ff;
- unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
- unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ uint32_t s = (*src++) | 0xff000000;
+ uint32_t d = *dst;
+
+ __m128i src = unpack_32_1x128 (s);
+ __m128i alpha = xmm_alpha;
+ __m128i mask = xmm_mask;
+ __m128i dest = unpack_32_1x128 (d);
+
+ *dst++ = pack_1x128_32 (
+ in_over_1x128 (&src, &alpha, &mask, &dest));
+
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_src = _mm_or_si128 (
+ load_128_unaligned ((__m128i*)src), mask_ff000000);
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_mask, &xmm_mask,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ dst += 4;
+ src += 4;
+ w -= 4;
+
+ }
+
+ while (w)
+ {
+ uint32_t s = (*src++) | 0xff000000;
+ uint32_t d = *dst;
+
+ __m128i src = unpack_32_1x128 (s);
+ __m128i alpha = xmm_alpha;
+ __m128i mask = xmm_mask;
+ __m128i dest = unpack_32_1x128 (d);
+
+ *dst++ = pack_1x128_32 (
+ in_over_1x128 (&src, &alpha, &mask, &dest));
+
+ w--;
+ }
+ }
- inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
+}
- save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
- }
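+/* OVER of an a8r8g8b8 source onto an a8r8g8b8 destination; each
+ * scanline is handled by sse2_combine_over_u.
+ */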
+static void
+sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ int dst_stride, src_stride;
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
- pd += 4;
- pm += 4;
- w -= 4;
- }
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
- while (w)
- {
- m = *pm++;
+ dst = dst_line;
+ src = src_line;
- if (m)
- {
- d = *pd;
- mmxMask = unpack_32_1x64 (m);
- mmxDst = unpack_32_1x64 (d);
-
- *pd = pack_1x64_32 (inOver_1x64 (&mmxSrc,
- &mmxAlpha,
- &mmxMask,
- &mmxDst));
- }
+ while (height--)
+ {
+ sse2_combine_over_u (imp, op, dst, src, NULL, width);
- pd++;
- w--;
- }
+ dst += dst_stride;
+ src += src_stride;
}
-
- _mm_empty();
}
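+/* Composite a single a8r8g8b8 pixel OVER a single r5g6b5 pixel. */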
+static force_inline uint16_t
+composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
+{
+ __m128i ms;
-/* -------------------------------------------------------------------------------------------------
- * fast_composite_over_8888_n_8888
- */
+ ms = unpack_32_1x128 (src);
+ return pack_565_32_16 (
+ pack_1x128_32 (
+ over_1x128 (
+ ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
+}
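+/* OVER of an a8r8g8b8 source onto an r5g6b5 destination. */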
static void
-sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int32_t xSrc,
- int32_t ySrc,
- int32_t xMask,
- int32_t yMask,
- int32_t xDst,
- int32_t yDst,
- int32_t width,
- int32_t height)
+sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
{
- uint32_t *dstLine, *dst;
- uint32_t *srcLine, *src;
- uint32_t mask;
- uint16_t w;
- int dstStride, srcStride;
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint16_t *dst_line, *dst, d;
+ uint32_t *src_line, *src, s;
+ int dst_stride, src_stride;
+ int32_t w;
- __m128i xmmMask;
- __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
- __m128i xmmDst, xmmDstLo, xmmDstHi;
- __m128i xmmAlphaLo, xmmAlphaHi;
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
- fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
- fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
- mask = _pixman_image_get_solid (pMask, pDst->bits.format);
-
- xmmMask = createMask_16_128 (mask >> 24);
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
while (height--)
{
- dst = dstLine;
- dstLine += dstStride;
- src = srcLine;
- srcLine += srcStride;
- w = width;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)dst);
- cachePrefetch ((__m128i*)src);
+ dst = dst_line;
+ src = src_line;
+
+ dst_line += dst_stride;
+ src_line += src_stride;
+ w = width;
+
+ /* Align dst on a 16-byte boundary */
+ while (w &&
+ ((unsigned long)dst & 15))
+ {
+ s = *src++;
+ d = *dst;
+
+ *dst++ = composite_over_8888_0565pixel (s, d);
+ w--;
+ }
+
+ /* It's an 8-pixel loop */
+ while (w >= 8)
+ {
+ /* I'm loading unaligned because I'm not sure
+ * about the address alignment.
+ */
+ xmm_src = load_128_unaligned ((__m128i*) src);
+ xmm_dst = load_128_aligned ((__m128i*) dst);
+
+ /* Unpacking */
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_565_128_4x128 (xmm_dst,
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ /* Load the next 4 pixels from memory ahead of time
+ * to optimize the memory read.
+ */
+ xmm_src = load_128_unaligned ((__m128i*) (src + 4));
+
+ over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_dst0, &xmm_dst1);
+
+ /* Unpacking */
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_dst2, &xmm_dst3);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_565_4x128_128 (
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+ w -= 8;
+ dst += 8;
+ src += 8;
+ }
+
+ while (w--)
+ {
+ s = *src++;
+ d = *dst;
+
+ *dst++ = composite_over_8888_0565pixel (s, d);
+ }
+ }
- while (w && (unsigned long)dst & 15)
- {
- uint32_t s = *src++;
- uint32_t d = *dst;
+}
- __m64 ms = unpack_32_1x64 (s);
- __m64 alpha = expandAlpha_1x64 (ms);
- __m64 dest = _mm_movepi64_pi64 (xmmMask);
- __m64 alphaDst = unpack_32_1x64 (d);
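+/* OVER of a solid color onto an a8r8g8b8 destination through an a8 mask. */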
+static void
+sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t src, srca;
+ uint32_t *dst_line, *dst;
+ uint8_t *mask_line, *mask;
+ int dst_stride, mask_stride;
+ int32_t w;
+ uint32_t m, d;
- *dst++ = pack_1x64_32 (inOver_1x64 (&ms,
- &alpha,
- &dest,
- &alphaDst));
+ __m128i xmm_src, xmm_alpha, xmm_def;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
- w--;
- }
+ __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)dst);
- cachePrefetch ((__m128i*)src);
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
- while (w >= 4)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)dst);
- cachePrefetchNext ((__m128i*)src);
+ srca = src >> 24;
+ if (src == 0)
+ return;
- xmmSrc = load128Unaligned ((__m128i*)src);
- xmmDst = load128Aligned ((__m128i*)dst);
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
- unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
+ xmm_def = create_mask_2x32_128 (src, src);
+ xmm_src = expand_pixel_32_1x128 (src);
+ xmm_alpha = expand_alpha_1x128 (xmm_src);
+ mmx_src = xmm_src;
+ mmx_alpha = xmm_alpha;
- inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMask, &xmmMask, &xmmDstLo, &xmmDstHi);
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ uint8_t m = *mask++;
+
+ if (m)
+ {
+ d = *dst;
+ mmx_mask = expand_pixel_8_1x128 (m);
+ mmx_dest = unpack_32_1x128 (d);
+
+ *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
+ &mmx_alpha,
+ &mmx_mask,
+ &mmx_dest));
+ }
+
+ w--;
+ dst++;
+ }
+
+ while (w >= 4)
+ {
+ m = *((uint32_t*)mask);
+
+ if (srca == 0xff && m == 0xffffffff)
+ {
+ save_128_aligned ((__m128i*)dst, xmm_def);
+ }
+ else if (m)
+ {
+ xmm_dst = load_128_aligned ((__m128i*) dst);
+ xmm_mask = unpack_32_1x128 (m);
+ xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+ /* Unpacking */
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ in_over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ w -= 4;
+ dst += 4;
+ mask += 4;
+ }
+
+ while (w)
+ {
+ uint8_t m = *mask++;
+
+ if (m)
+ {
+ d = *dst;
+ mmx_mask = expand_pixel_8_1x128 (m);
+ mmx_dest = unpack_32_1x128 (d);
+
+ *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
+ &mmx_alpha,
+ &mmx_mask,
+ &mmx_dest));
+ }
+
+ w--;
+ dst++;
+ }
+ }
- save128Aligned( (__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
+}
- dst += 4;
- src += 4;
- w -= 4;
- }
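+/* Fill a rectangle of 8, 16 or 32 bpp pixels with a constant value,
+ * using aligned 128-bit stores for the bulk of each scanline.
+ */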
+static pixman_bool_t
+pixman_fill_sse2 (uint32_t *bits,
+ int stride,
+ int bpp,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t data)
+{
+ uint32_t byte_width;
+ uint8_t *byte_line;
- while (w)
- {
- uint32_t s = *src++;
- uint32_t d = *dst;
+ __m128i xmm_def;
- __m64 ms = unpack_32_1x64 (s);
- __m64 alpha = expandAlpha_1x64 (ms);
- __m64 mask = _mm_movepi64_pi64 (xmmMask);
- __m64 dest = unpack_32_1x64 (d);
+ if (bpp == 8)
+ {
+ uint8_t b;
+ uint16_t w;
- *dst++ = pack_1x64_32 (inOver_1x64 (&ms,
- &alpha,
- &mask,
- &dest));
+ stride = stride * (int) sizeof (uint32_t) / 1;
+ byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
+ byte_width = width;
+ stride *= 1;
- w--;
- }
+ b = data & 0xff;
+ w = (b << 8) | b;
+ data = (w << 16) | w;
}
+ else if (bpp == 16)
+ {
+ stride = stride * (int) sizeof (uint32_t) / 2;
+ byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
+ byte_width = 2 * width;
+ stride *= 2;
- _mm_empty();
-}
-
-/* -------------------------------------------------------------------------------------------------
- * fast_Composite_over_x888_n_8888
- */
-static void
-sse2_Composite_over_x888_n_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int32_t xSrc,
- int32_t ySrc,
- int32_t xMask,
- int32_t yMask,
- int32_t xDst,
- int32_t yDst,
- int32_t width,
- int32_t height)
-{
- uint32_t *dstLine, *dst;
- uint32_t *srcLine, *src;
- uint32_t mask;
- int dstStride, srcStride;
- uint16_t w;
-
- __m128i xmmMask, xmmAlpha;
- __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
- __m128i xmmDst, xmmDstLo, xmmDstHi;
-
- fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
- fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
- mask = _pixman_image_get_solid (pMask, pDst->bits.format);
+ data = (data & 0xffff) * 0x00010001;
+ }
+ else if (bpp == 32)
+ {
+ stride = stride * (int) sizeof (uint32_t) / 4;
+ byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
+ byte_width = 4 * width;
+ stride *= 4;
+ }
+ else
+ {
+ return FALSE;
+ }
- xmmMask = createMask_16_128 (mask >> 24);
- xmmAlpha = Mask00ff;
+ xmm_def = create_mask_2x32_128 (data, data);
while (height--)
{
- dst = dstLine;
- dstLine += dstStride;
- src = srcLine;
- srcLine += srcStride;
- w = width;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)dst);
- cachePrefetch ((__m128i*)src);
+ int w;
+ uint8_t *d = byte_line;
+ byte_line += stride;
+ w = byte_width;
+
+ if (w >= 1 && ((unsigned long)d & 1))
+ {
+ *(uint8_t *)d = data;
+ w -= 1;
+ d += 1;
+ }
+
+ while (w >= 2 && ((unsigned long)d & 3))
+ {
+ *(uint16_t *)d = data;
+ w -= 2;
+ d += 2;
+ }
+
+ while (w >= 4 && ((unsigned long)d & 15))
+ {
+ *(uint32_t *)d = data;
+
+ w -= 4;
+ d += 4;
+ }
+
+ while (w >= 128)
+ {
+ save_128_aligned ((__m128i*)(d), xmm_def);
+ save_128_aligned ((__m128i*)(d + 16), xmm_def);
+ save_128_aligned ((__m128i*)(d + 32), xmm_def);
+ save_128_aligned ((__m128i*)(d + 48), xmm_def);
+ save_128_aligned ((__m128i*)(d + 64), xmm_def);
+ save_128_aligned ((__m128i*)(d + 80), xmm_def);
+ save_128_aligned ((__m128i*)(d + 96), xmm_def);
+ save_128_aligned ((__m128i*)(d + 112), xmm_def);
+
+ d += 128;
+ w -= 128;
+ }
+
+ if (w >= 64)
+ {
+ save_128_aligned ((__m128i*)(d), xmm_def);
+ save_128_aligned ((__m128i*)(d + 16), xmm_def);
+ save_128_aligned ((__m128i*)(d + 32), xmm_def);
+ save_128_aligned ((__m128i*)(d + 48), xmm_def);
+
+ d += 64;
+ w -= 64;
+ }
+
+ if (w >= 32)
+ {
+ save_128_aligned ((__m128i*)(d), xmm_def);
+ save_128_aligned ((__m128i*)(d + 16), xmm_def);
+
+ d += 32;
+ w -= 32;
+ }
+
+ if (w >= 16)
+ {
+ save_128_aligned ((__m128i*)(d), xmm_def);
+
+ d += 16;
+ w -= 16;
+ }
+
+ while (w >= 4)
+ {
+ *(uint32_t *)d = data;
+
+ w -= 4;
+ d += 4;
+ }
+
+ if (w >= 2)
+ {
+ *(uint16_t *)d = data;
+ w -= 2;
+ d += 2;
+ }
+
+ if (w >= 1)
+ {
+ *(uint8_t *)d = data;
+ w -= 1;
+ d += 1;
+ }
+ }
- while (w && (unsigned long)dst & 15)
- {
- uint32_t s = (*src++) | 0xff000000;
- uint32_t d = *dst;
+ return TRUE;
+}
- __m64 src = unpack_32_1x64 (s);
- __m64 alpha = _mm_movepi64_pi64 (xmmAlpha);
- __m64 mask = _mm_movepi64_pi64 (xmmMask);
- __m64 dest = unpack_32_1x64 (d);
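+/* SRC of a solid color through an a8 mask into an a8r8g8b8 destination;
+ * pixels whose mask is zero are cleared.
+ */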
+static void
+sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t src, srca;
+ uint32_t *dst_line, *dst;
+ uint8_t *mask_line, *mask;
+ int dst_stride, mask_stride;
+ int32_t w;
+ uint32_t m;
- *dst++ = pack_1x64_32 (inOver_1x64 (&src,
- &alpha,
- &mask,
- &dest));
+ __m128i xmm_src, xmm_def;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
- w--;
- }
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)dst);
- cachePrefetch ((__m128i*)src);
+ srca = src >> 24;
+ if (src == 0)
+ {
+ pixman_fill_sse2 (dest_image->bits.bits, dest_image->bits.rowstride,
+ PIXMAN_FORMAT_BPP (dest_image->bits.format),
+ dest_x, dest_y, width, height, 0);
+ return;
+ }
- while (w >= 4)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)dst);
- cachePrefetchNext ((__m128i*)src);
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
- xmmSrc = _mm_or_si128 (load128Unaligned ((__m128i*)src), Maskff000000);
- xmmDst = load128Aligned ((__m128i*)dst);
+ xmm_def = create_mask_2x32_128 (src, src);
+ xmm_src = expand_pixel_32_1x128 (src);
- unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ uint8_t m = *mask++;
+
+ if (m)
+ {
+ *dst = pack_1x128_32 (
+ pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
+ }
+ else
+ {
+ *dst = 0;
+ }
+
+ w--;
+ dst++;
+ }
+
+ while (w >= 4)
+ {
+ m = *((uint32_t*)mask);
+
+ if (srca == 0xff && m == 0xffffffff)
+ {
+ save_128_aligned ((__m128i*)dst, xmm_def);
+ }
+ else if (m)
+ {
+ xmm_mask = unpack_32_1x128 (m);
+ xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+ /* Unpacking */
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ pix_multiply_2x128 (&xmm_src, &xmm_src,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
+ }
+ else
+ {
+ save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
+ }
+
+ w -= 4;
+ dst += 4;
+ mask += 4;
+ }
+
+ while (w)
+ {
+ uint8_t m = *mask++;
+
+ if (m)
+ {
+ *dst = pack_1x128_32 (
+ pix_multiply_1x128 (
+ xmm_src, expand_pixel_8_1x128 (m)));
+ }
+ else
+ {
+ *dst = 0;
+ }
+
+ w--;
+ dst++;
+ }
+ }
- inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlpha, &xmmAlpha, &xmmMask, &xmmMask, &xmmDstLo, &xmmDstHi);
+}
- save128Aligned( (__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
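+/* OVER of a solid color onto an r5g6b5 destination through an a8 mask. */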
+static void
+sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t src;
+ uint16_t *dst_line, *dst, d;
+ uint8_t *mask_line, *mask;
+ int dst_stride, mask_stride;
+ int32_t w;
+ uint32_t m;
+ __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
- dst += 4;
- src += 4;
- w -= 4;
+ __m128i xmm_src, xmm_alpha;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+ __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
- }
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
- while (w)
- {
- uint32_t s = (*src++) | 0xff000000;
- uint32_t d = *dst;
+ if (src == 0)
+ return;
- __m64 src = unpack_32_1x64 (s);
- __m64 alpha = _mm_movepi64_pi64 (xmmAlpha);
- __m64 mask = _mm_movepi64_pi64 (xmmMask);
- __m64 dest = unpack_32_1x64 (d);
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
- *dst++ = pack_1x64_32 (inOver_1x64 (&src,
- &alpha,
- &mask,
- &dest));
+ xmm_src = expand_pixel_32_1x128 (src);
+ xmm_alpha = expand_alpha_1x128 (xmm_src);
+ mmx_src = xmm_src;
+ mmx_alpha = xmm_alpha;
- w--;
- }
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ m = *mask++;
+
+ if (m)
+ {
+ d = *dst;
+ mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+ mmx_dest = expand565_16_1x128 (d);
+
+ *dst = pack_565_32_16 (
+ pack_1x128_32 (
+ in_over_1x128 (
+ &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+ }
+
+ w--;
+ dst++;
+ }
+
+ while (w >= 8)
+ {
+ xmm_dst = load_128_aligned ((__m128i*) dst);
+ unpack_565_128_4x128 (xmm_dst,
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+
+ m = *((uint32_t*)mask);
+ mask += 4;
+
+ if (m)
+ {
+ xmm_mask = unpack_32_1x128 (m);
+ xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+ /* Unpacking */
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ in_over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst0, &xmm_dst1);
+ }
+
+ m = *((uint32_t*)mask);
+ mask += 4;
+
+ if (m)
+ {
+ xmm_mask = unpack_32_1x128 (m);
+ xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+ /* Unpacking */
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+ expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+ in_over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst2, &xmm_dst3);
+ }
+
+ save_128_aligned (
+ (__m128i*)dst, pack_565_4x128_128 (
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+ w -= 8;
+ dst += 8;
+ }
+
+ while (w)
+ {
+ m = *mask++;
+
+ if (m)
+ {
+ d = *dst;
+ mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+ mmx_dest = expand565_16_1x128 (d);
+
+ *dst = pack_565_32_16 (
+ pack_1x128_32 (
+ in_over_1x128 (
+ &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+ }
+
+ w--;
+ dst++;
+ }
}
- _mm_empty();
}
-/* -------------------------------------------------------------------------------------------------
- * fast_composite_over_8888_8888
- */
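+/* OVER of non-premultiplied 'pixbuf' pixels onto an r5g6b5 destination;
+ * fully opaque source pixels are converted directly and fully
+ * transparent ones leave the destination untouched.
+ */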
static void
-sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int32_t xSrc,
- int32_t ySrc,
- int32_t xMask,
- int32_t yMask,
- int32_t xDst,
- int32_t yDst,
- int32_t width,
- int32_t height)
+sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
{
- int dstStride, srcStride;
- uint32_t *dstLine, *dst;
- uint32_t *srcLine, *src;
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint16_t *dst_line, *dst, d;
+ uint32_t *src_line, *src, s;
+ int dst_stride, src_stride;
+ int32_t w;
+ uint32_t opaque, zero;
- fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
- fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
+ __m128i ms;
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
- dst = dstLine;
- src = srcLine;
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
while (height--)
{
- coreCombineOverUsse2 (dst, src, NULL, width);
-
- dst += dstStride;
- src += srcStride;
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ s = *src++;
+ d = *dst;
+
+ ms = unpack_32_1x128 (s);
+
+ *dst++ = pack_565_32_16 (
+ pack_1x128_32 (
+ over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
+ w--;
+ }
+
+ while (w >= 8)
+ {
+ /* First round */
+ xmm_src = load_128_unaligned ((__m128i*)src);
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ opaque = is_opaque (xmm_src);
+ zero = is_zero (xmm_src);
+
+ unpack_565_128_4x128 (xmm_dst,
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+
+ /* preload next round */
+ xmm_src = load_128_unaligned ((__m128i*)(src + 4));
+
+ if (opaque)
+ {
+ invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_dst0, &xmm_dst1);
+ }
+ else if (!zero)
+ {
+ over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_dst0, &xmm_dst1);
+ }
+
+ /* Second round */
+ opaque = is_opaque (xmm_src);
+ zero = is_zero (xmm_src);
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+
+ if (opaque)
+ {
+ invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_dst2, &xmm_dst3);
+ }
+ else if (!zero)
+ {
+ over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_dst2, &xmm_dst3);
+ }
+
+ save_128_aligned (
+ (__m128i*)dst, pack_565_4x128_128 (
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+ w -= 8;
+ src += 8;
+ dst += 8;
+ }
+
+ while (w)
+ {
+ s = *src++;
+ d = *dst;
+
+ ms = unpack_32_1x128 (s);
+
+ *dst++ = pack_565_32_16 (
+ pack_1x128_32 (
+ over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
+ w--;
+ }
}
- _mm_empty();
-}
-
-/* -------------------------------------------------------------------------------------------------
- * fast_composite_over_8888_0565
- */
-static force_inline uint16_t
-fast_composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
-{
- __m64 ms;
- ms = unpack_32_1x64 (src);
- return pack565_32_16( pack_1x64_32 (over_1x64 (ms,
- expandAlpha_1x64 (ms),
- expand565_16_1x64 (dst))));
}
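+/* OVER of non-premultiplied 'pixbuf' pixels onto an a8r8g8b8 destination. */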
static void
-sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int32_t xSrc,
- int32_t ySrc,
- int32_t xMask,
- int32_t yMask,
- int32_t xDst,
- int32_t yDst,
- int32_t width,
- int32_t height)
+sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
{
- uint16_t *dstLine, *dst, d;
- uint32_t *srcLine, *src, s;
- int dstStride, srcStride;
- uint16_t w;
-
- __m128i xmmAlphaLo, xmmAlphaHi;
- __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
- __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;
-
- fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
- fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
-
-#if 0
- /* FIXME
- *
- * I copy the code from MMX one and keep the fixme.
- * If it's a problem there, probably is a problem here.
- */
- assert (pSrc->pDrawable == pMask->pDrawable);
-#endif
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t *dst_line, *dst, d;
+ uint32_t *src_line, *src, s;
+ int dst_stride, src_stride;
+ int32_t w;
+ uint32_t opaque, zero;
+
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst_lo, xmm_dst_hi;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
while (height--)
{
- dst = dstLine;
- src = srcLine;
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 15)
+ {
+ s = *src++;
+ d = *dst;
+
+ *dst++ = pack_1x128_32 (
+ over_rev_non_pre_1x128 (
+ unpack_32_1x128 (s), unpack_32_1x128 (d)));
+
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ xmm_src_hi = load_128_unaligned ((__m128i*)src);
+
+ opaque = is_opaque (xmm_src_hi);
+ zero = is_zero (xmm_src_hi);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+
+ if (opaque)
+ {
+ invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+ else if (!zero)
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ w -= 4;
+ dst += 4;
+ src += 4;
+ }
+
+ while (w)
+ {
+ s = *src++;
+ d = *dst;
+
+ *dst++ = pack_1x128_32 (
+ over_rev_non_pre_1x128 (
+ unpack_32_1x128 (s), unpack_32_1x128 (d)));
+
+ w--;
+ }
+ }
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)src);
- cachePrefetch ((__m128i*)dst);
+}
- dstLine += dstStride;
- srcLine += srcStride;
- w = width;
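+/* Component-alpha OVER of a solid color onto an r5g6b5 destination
+ * through an a8r8g8b8 mask.
+ */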
+static void
+sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t src;
+ uint16_t *dst_line, *dst, d;
+ uint32_t *mask_line, *mask, m;
+ int dst_stride, mask_stride;
+ int w;
+ uint32_t pack_cmp;
- /* Align dst on a 16-byte boundary */
- while (w &&
- ((unsigned long)dst & 15))
- {
- s = *src++;
- d = *dst;
+ __m128i xmm_src, xmm_alpha;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+ __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
- *dst++ = fast_composite_over_8888_0565pixel (s, d);
- w--;
- }
+ __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)src);
- cachePrefetch ((__m128i*)dst);
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
- /* It's a 8 pixel loop */
- while (w >= 8)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)src);
- cachePrefetchNext ((__m128i*)dst);
+ if (src == 0)
+ return;
- /* I'm loading unaligned because I'm not sure about the address alignment. */
- xmmSrc = load128Unaligned ((__m128i*) src);
- xmmDst = load128Aligned ((__m128i*) dst);
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
- /* Unpacking */
- unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
- unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
+ xmm_src = expand_pixel_32_1x128 (src);
+ xmm_alpha = expand_alpha_1x128 (xmm_src);
+ mmx_src = xmm_src;
+ mmx_alpha = xmm_alpha;
- /* I'm loading next 4 pixels from memory before to optimze the memory read. */
- xmmSrc = load128Unaligned ((__m128i*) (src+4));
+ while (height--)
+ {
+ w = width;
+ mask = mask_line;
+ dst = dst_line;
+ mask_line += mask_stride;
+ dst_line += dst_stride;
+
+ while (w && ((unsigned long)dst & 15))
+ {
+ m = *(uint32_t *) mask;
+
+ if (m)
+ {
+ d = *dst;
+ mmx_mask = unpack_32_1x128 (m);
+ mmx_dest = expand565_16_1x128 (d);
+
+ *dst = pack_565_32_16 (
+ pack_1x128_32 (
+ in_over_1x128 (
+ &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+ }
+
+ w--;
+ dst++;
+ mask++;
+ }
+
+ while (w >= 8)
+ {
+ /* First round */
+ xmm_mask = load_128_unaligned ((__m128i*)mask);
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ pack_cmp = _mm_movemask_epi8 (
+ _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+ unpack_565_128_4x128 (xmm_dst,
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+ /* preload next round */
+ xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
+
+ if (pack_cmp != 0xffff)
+ {
+ in_over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst0, &xmm_dst1);
+ }
+
+ /* Second round */
+ pack_cmp = _mm_movemask_epi8 (
+ _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+ if (pack_cmp != 0xffff)
+ {
+ in_over_2x128 (&xmm_src, &xmm_src,
+ &xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst2, &xmm_dst3);
+ }
+
+ save_128_aligned (
+ (__m128i*)dst, pack_565_4x128_128 (
+ &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+ w -= 8;
+ dst += 8;
+ mask += 8;
+ }
+
+ while (w)
+ {
+ m = *(uint32_t *) mask;
+
+ if (m)
+ {
+ d = *dst;
+ mmx_mask = unpack_32_1x128 (m);
+ mmx_dest = expand565_16_1x128 (d);
+
+ *dst = pack_565_32_16 (
+ pack_1x128_32 (
+ in_over_1x128 (
+ &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+ }
+
+ w--;
+ dst++;
+ mask++;
+ }
+ }
- over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDst0, &xmmDst1);
+}
- /* Unpacking */
- unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
- expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
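+/* IN: multiply the a8 destination by the solid source's alpha and the a8 mask. */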
+static void
+sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint8_t *dst_line, *dst;
+ uint8_t *mask_line, *mask;
+ int dst_stride, mask_stride;
+ uint32_t d, m;
+ uint32_t src;
+ int32_t w;
- over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDst2, &xmmDst3);
+ __m128i xmm_alpha;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
- w -= 8;
- dst += 8;
- src += 8;
- }
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
- while (w--)
- {
- s = *src++;
- d = *dst;
+ xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
- *dst++ = fast_composite_over_8888_0565pixel (s, d);
- }
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w && ((unsigned long)dst & 15))
+ {
+ m = (uint32_t) *mask++;
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x128_32 (
+ pix_multiply_1x128 (
+ pix_multiply_1x128 (xmm_alpha,
+ unpack_32_1x128 (m)),
+ unpack_32_1x128 (d)));
+ w--;
+ }
+
+ while (w >= 16)
+ {
+ xmm_mask = load_128_unaligned ((__m128i*)mask);
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ mask += 16;
+ dst += 16;
+ w -= 16;
+ }
+
+ while (w)
+ {
+ m = (uint32_t) *mask++;
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x128_32 (
+ pix_multiply_1x128 (
+ pix_multiply_1x128 (
+ xmm_alpha, unpack_32_1x128 (m)),
+ unpack_32_1x128 (d)));
+ w--;
+ }
}
- _mm_empty();
}
-/* -------------------------------------------------------------------------------------------------
- * fast_CompositeOver_n_8_8888
- */
-
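+/* IN of a solid source into an a8 destination (dst = srca * dst), with
+ * shortcuts for srca == 0x00 and srca == 0xff.
+ */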
static void
-sse2_CompositeOver_n_8_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int32_t xSrc,
- int32_t ySrc,
- int32_t xMask,
- int32_t yMask,
- int32_t xDst,
- int32_t yDst,
- int32_t width,
- int32_t height)
+sse2_composite_in_n_8 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
{
- uint32_t src, srca;
- uint32_t *dstLine, *dst;
- uint8_t *maskLine, *mask;
- int dstStride, maskStride;
- uint16_t w;
- uint32_t m, d;
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint8_t *dst_line, *dst;
+ int dst_stride;
+ uint32_t d;
+ uint32_t src;
+ int32_t w;
- __m128i xmmSrc, xmmAlpha, xmmDef;
- __m128i xmmDst, xmmDstLo, xmmDstHi;
- __m128i xmmMask, xmmMaskLo, xmmMaskHi;
+ __m128i xmm_alpha;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- __m64 mmxSrc, mmxAlpha, mmxMask, mmxDest;
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
- src = _pixman_image_get_solid(pSrc, pDst->bits.format);
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
- srca = src >> 24;
- if (src == 0)
- return;
+ xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
- fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
- fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
+ src = src >> 24;
- xmmDef = createMask_2x32_128 (src, src);
- xmmSrc = expandPixel_32_1x128 (src);
- xmmAlpha = expandAlpha_1x128 (xmmSrc);
- mmxSrc = _mm_movepi64_pi64 (xmmSrc);
- mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);
+ if (src == 0xff)
+ return;
- while (height--)
+ if (src == 0x00)
{
- dst = dstLine;
- dstLine += dstStride;
- mask = maskLine;
- maskLine += maskStride;
- w = width;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)mask);
- cachePrefetch ((__m128i*)dst);
-
- while (w && (unsigned long)dst & 15)
- {
- uint8_t m = *mask++;
+ pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
+ 8, dest_x, dest_y, width, height, src);
- if (m)
- {
- d = *dst;
- mmxMask = expandPixel_8_1x64 (m);
- mmxDest = unpack_32_1x64 (d);
-
- *dst = pack_1x64_32 (inOver_1x64 (&mmxSrc,
- &mmxAlpha,
- &mmxMask,
- &mmxDest));
- }
+ return;
+ }
- w--;
- dst++;
- }
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ w = width;
+
+ while (w && ((unsigned long)dst & 15))
+ {
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x128_32 (
+ pix_multiply_1x128 (
+ xmm_alpha,
+ unpack_32_1x128 (d)));
+ w--;
+ }
+
+ while (w >= 16)
+ {
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
+ &xmm_dst_lo, &xmm_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ dst += 16;
+ w -= 16;
+ }
+
+ while (w)
+ {
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x128_32 (
+ pix_multiply_1x128 (
+ xmm_alpha,
+ unpack_32_1x128 (d)));
+ w--;
+ }
+ }
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)mask);
- cachePrefetch ((__m128i*)dst);
+}
- while (w >= 4)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)mask);
- cachePrefetchNext ((__m128i*)dst);
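+/* IN of an a8 source into an a8 destination (dst = src * dst). */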
+static void
+sse2_composite_in_8_8 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint8_t *dst_line, *dst;
+ uint8_t *src_line, *src;
+ int src_stride, dst_stride;
+ int32_t w;
+ uint32_t s, d;
- m = *((uint32_t*)mask);
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- if (srca == 0xff && m == 0xffffffff)
- {
- save128Aligned ((__m128i*)dst, xmmDef);
- }
- else if (m)
- {
- xmmDst = load128Aligned ((__m128i*) dst);
- xmmMask = unpack_32_1x128 (m);
- xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
- /* Unpacking */
- unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
- unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w && ((unsigned long)dst & 15))
+ {
+ s = (uint32_t) *src++;
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x128_32 (
+ pix_multiply_1x128 (
+ unpack_32_1x128 (s), unpack_32_1x128 (d)));
+ w--;
+ }
+
+ while (w >= 16)
+ {
+ xmm_src = load_128_unaligned ((__m128i*)src);
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_dst_lo, &xmm_dst_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ src += 16;
+ dst += 16;
+ w -= 16;
+ }
+
+ while (w)
+ {
+ s = (uint32_t) *src++;
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x128_32 (
+ pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
+ w--;
+ }
+ }
- expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
+}
- inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
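+/* Saturating ADD of a solid source's alpha, scaled by an a8 mask,
+ * into an a8 destination.
+ */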
+static void
+sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint8_t *dst_line, *dst;
+ uint8_t *mask_line, *mask;
+ int dst_stride, mask_stride;
+ int32_t w;
+ uint32_t src;
+ uint32_t m, d;
- save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
- }
+ __m128i xmm_alpha;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
- w -= 4;
- dst += 4;
- mask += 4;
- }
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
- while (w)
- {
- uint8_t m = *mask++;
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
- if (m)
- {
- d = *dst;
- mmxMask = expandPixel_8_1x64 (m);
- mmxDest = unpack_32_1x64 (d);
-
- *dst = pack_1x64_32 (inOver_1x64 (&mmxSrc,
- &mmxAlpha,
- &mmxMask,
- &mmxDest));
- }
+ xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
- w--;
- dst++;
- }
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ while (w && ((unsigned long)dst & 15))
+ {
+ m = (uint32_t) *mask++;
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x128_32 (
+ _mm_adds_epu16 (
+ pix_multiply_1x128 (
+ xmm_alpha, unpack_32_1x128 (m)),
+ unpack_32_1x128 (d)));
+ w--;
+ }
+
+ while (w >= 16)
+ {
+ xmm_mask = load_128_unaligned ((__m128i*)mask);
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
+ &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_mask_lo, &xmm_mask_hi);
+
+ xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
+ xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+ mask += 16;
+ dst += 16;
+ w -= 16;
+ }
+
+ while (w)
+ {
+ m = (uint32_t) *mask++;
+ d = (uint32_t) *dst;
+
+ *dst++ = (uint8_t) pack_1x128_32 (
+ _mm_adds_epu16 (
+ pix_multiply_1x128 (
+ xmm_alpha, unpack_32_1x128 (m)),
+ unpack_32_1x128 (d)));
+
+ w--;
+ }
}
- _mm_empty();
}
-/* -------------------------------------------------------------------------------------------------
- * fast_CompositeOver_n_8_8888
- */
-
-pixman_bool_t
-pixmanFillsse2 (uint32_t *bits,
- int stride,
- int bpp,
- int x,
- int y,
- int width,
- int height,
- uint32_t data)
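+/* Saturating ADD of a solid source's alpha into an a8 destination;
+ * the 8-bit value is replicated across a 128-bit register.
+ */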
+static void
+sse2_composite_add_n_8 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
{
- uint32_t byte_width;
- uint8_t *byte_line;
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint8_t *dst_line, *dst;
+ int dst_stride;
+ int32_t w;
+ uint32_t src;
- __m128i xmmDef;
+ __m128i xmm_src;
- if (bpp == 16 && (data >> 16 != (data & 0xffff)))
- return FALSE;
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
- if (bpp != 16 && bpp != 32)
- return FALSE;
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
- if (bpp == 16)
- {
- stride = stride * (int) sizeof (uint32_t) / 2;
- byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
- byte_width = 2 * width;
- stride *= 2;
- }
- else
- {
- stride = stride * (int) sizeof (uint32_t) / 4;
- byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
- byte_width = 4 * width;
- stride *= 4;
- }
+ src >>= 24;
- cachePrefetch ((__m128i*)byte_line);
- xmmDef = createMask_2x32_128 (data, data);
+ if (src == 0x00)
+ return;
- while (height--)
+ if (src == 0xff)
{
- int w;
- uint8_t *d = byte_line;
- byte_line += stride;
- w = byte_width;
-
+ pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
+ 8, dest_x, dest_y, width, height, 0xff);
- cachePrefetchNext ((__m128i*)d);
-
- while (w >= 2 && ((unsigned long)d & 3))
- {
- *(uint16_t *)d = data;
- w -= 2;
- d += 2;
- }
+ return;
+ }
- while (w >= 4 && ((unsigned long)d & 15))
- {
- *(uint32_t *)d = data;
+ src = (src << 24) | (src << 16) | (src << 8) | src;
+ xmm_src = _mm_set_epi32 (src, src, src, src);
- w -= 4;
- d += 4;
- }
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ w = width;
+
+ while (w && ((unsigned long)dst & 15))
+ {
+ *dst = (uint8_t)_mm_cvtsi128_si32 (
+ _mm_adds_epu8 (
+ xmm_src,
+ _mm_cvtsi32_si128 (*dst)));
+
+ w--;
+ dst++;
+ }
+
+ while (w >= 16)
+ {
+ save_128_aligned (
+ (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
+
+ dst += 16;
+ w -= 16;
+ }
+
+ while (w)
+ {
+ *dst = (uint8_t)_mm_cvtsi128_si32 (
+ _mm_adds_epu8 (
+ xmm_src,
+ _mm_cvtsi32_si128 (*dst)));
+
+ w--;
+ dst++;
+ }
+ }
- cachePrefetchNext ((__m128i*)d);
+}
- while (w >= 128)
- {
- cachePrefetch (((__m128i*)d) + 12);
-
- save128Aligned ((__m128i*)(d), xmmDef);
- save128Aligned ((__m128i*)(d+16), xmmDef);
- save128Aligned ((__m128i*)(d+32), xmmDef);
- save128Aligned ((__m128i*)(d+48), xmmDef);
- save128Aligned ((__m128i*)(d+64), xmmDef);
- save128Aligned ((__m128i*)(d+80), xmmDef);
- save128Aligned ((__m128i*)(d+96), xmmDef);
- save128Aligned ((__m128i*)(d+112), xmmDef);
-
- d += 128;
- w -= 128;
- }
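+/* Saturating ADD of an a8 source onto an a8 destination. */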
+static void
+sse2_composite_add_8_8 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint8_t *dst_line, *dst;
+ uint8_t *src_line, *src;
+ int dst_stride, src_stride;
+ int32_t w;
+ uint16_t t;
- if (w >= 64)
- {
- cachePrefetch (((__m128i*)d) + 8);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
- save128Aligned ((__m128i*)(d), xmmDef);
- save128Aligned ((__m128i*)(d+16), xmmDef);
- save128Aligned ((__m128i*)(d+32), xmmDef);
- save128Aligned ((__m128i*)(d+48), xmmDef);
+ while (height--)
+ {
+ dst = dst_line;
+ src = src_line;
+
+ dst_line += dst_stride;
+ src_line += src_stride;
+ w = width;
+
+ /* Small head */
+ while (w && (unsigned long)dst & 3)
+ {
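+ /* if the sum overflows 8 bits, (0 - (t >> 8)) becomes all ones,
+ * so the OR below saturates the result to 0xff.
+ */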
+ t = (*dst) + (*src++);
+ *dst++ = t | (0 - (t >> 8));
+ w--;
+ }
+
+ sse2_combine_add_u (imp, op,
+ (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
+
+ /* Small tail */
+ dst += w & 0xfffc;
+ src += w & 0xfffc;
+
+ w &= 3;
+
+ while (w)
+ {
+ t = (*dst) + (*src++);
+ *dst++ = t | (0 - (t >> 8));
+ w--;
+ }
+ }
- d += 64;
- w -= 64;
- }
+}
- cachePrefetchNext ((__m128i*)d);
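+/* Saturating ADD of an a8r8g8b8 source onto an a8r8g8b8 destination,
+ * one scanline at a time via sse2_combine_add_u.
+ */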
+static void
+sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t *dst_line, *dst;
+ uint32_t *src_line, *src;
+ int dst_stride, src_stride;
- if (w >= 32)
- {
- save128Aligned ((__m128i*)(d), xmmDef);
- save128Aligned ((__m128i*)(d+16), xmmDef);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- d += 32;
- w -= 32;
- }
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
- if (w >= 16)
- {
- save128Aligned ((__m128i*)(d), xmmDef);
+ sse2_combine_add_u (imp, op, dst, src, NULL, width);
+ }
- d += 16;
- w -= 16;
- }
+}
- cachePrefetchNext ((__m128i*)d);
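+/* Plain blit between two images of equal depth (16 or 32 bpp), using
+ * 64-byte unrolled SSE2 copies for the aligned middle of each scanline.
+ */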
+static pixman_bool_t
+pixman_blt_sse2 (uint32_t *src_bits,
+ uint32_t *dst_bits,
+ int src_stride,
+ int dst_stride,
+ int src_bpp,
+ int dst_bpp,
+ int src_x,
+ int src_y,
+ int dest_x,
+ int dest_y,
+ int width,
+ int height)
+{
+ uint8_t * src_bytes;
+ uint8_t * dst_bytes;
+ int byte_width;
- while (w >= 4)
- {
- *(uint32_t *)d = data;
+ if (src_bpp != dst_bpp)
+ return FALSE;
- w -= 4;
- d += 4;
- }
+ if (src_bpp == 16)
+ {
+ src_stride = src_stride * (int) sizeof (uint32_t) / 2;
+ dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
+ src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
+ dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
+ byte_width = 2 * width;
+ src_stride *= 2;
+ dst_stride *= 2;
+ }
+ else if (src_bpp == 32)
+ {
+ src_stride = src_stride * (int) sizeof (uint32_t) / 4;
+ dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
+ src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
+ dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
+ byte_width = 4 * width;
+ src_stride *= 4;
+ dst_stride *= 4;
+ }
+ else
+ {
+ return FALSE;
+ }
- if (w >= 2)
- {
- *(uint16_t *)d = data;
- w -= 2;
- d += 2;
- }
+ while (height--)
+ {
+ int w;
+ uint8_t *s = src_bytes;
+ uint8_t *d = dst_bytes;
+ src_bytes += src_stride;
+ dst_bytes += dst_stride;
+ w = byte_width;
+
+ while (w >= 2 && ((unsigned long)d & 3))
+ {
+ *(uint16_t *)d = *(uint16_t *)s;
+ w -= 2;
+ s += 2;
+ d += 2;
+ }
+
+ while (w >= 4 && ((unsigned long)d & 15))
+ {
+ *(uint32_t *)d = *(uint32_t *)s;
+
+ w -= 4;
+ s += 4;
+ d += 4;
+ }
+
+ while (w >= 64)
+ {
+ __m128i xmm0, xmm1, xmm2, xmm3;
+
+ xmm0 = load_128_unaligned ((__m128i*)(s));
+ xmm1 = load_128_unaligned ((__m128i*)(s + 16));
+ xmm2 = load_128_unaligned ((__m128i*)(s + 32));
+ xmm3 = load_128_unaligned ((__m128i*)(s + 48));
+
+ save_128_aligned ((__m128i*)(d), xmm0);
+ save_128_aligned ((__m128i*)(d + 16), xmm1);
+ save_128_aligned ((__m128i*)(d + 32), xmm2);
+ save_128_aligned ((__m128i*)(d + 48), xmm3);
+
+ s += 64;
+ d += 64;
+ w -= 64;
+ }
+
+ while (w >= 16)
+ {
+ save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s));
+
+ w -= 16;
+ d += 16;
+ s += 16;
+ }
+
+ while (w >= 4)
+ {
+ *(uint32_t *)d = *(uint32_t *)s;
+
+ w -= 4;
+ s += 4;
+ d += 4;
+ }
+
+ if (w >= 2)
+ {
+ *(uint16_t *)d = *(uint16_t *)s;
+ w -= 2;
+ s += 2;
+ d += 2;
+ }
}
- _mm_empty();
+
return TRUE;
}
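+/* SRC copy of a rectangle, implemented on top of pixman_blt_sse2. */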
static void
-sse2_CompositeSrc_n_8_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int32_t xSrc,
- int32_t ySrc,
- int32_t xMask,
- int32_t yMask,
- int32_t xDst,
- int32_t yDst,
- int32_t width,
- int32_t height)
+sse2_composite_copy_area (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
{
- uint32_t src, srca;
- uint32_t *dstLine, *dst;
- uint8_t *maskLine, *mask;
- int dstStride, maskStride;
- uint16_t w;
- uint32_t m;
-
- __m128i xmmSrc, xmmDef;
- __m128i xmmMask, xmmMaskLo, xmmMaskHi;
-
- src = _pixman_image_get_solid(pSrc, pDst->bits.format);
+ PIXMAN_COMPOSITE_ARGS (info);
+ pixman_blt_sse2 (src_image->bits.bits,
+ dest_image->bits.bits,
+ src_image->bits.rowstride,
+ dest_image->bits.rowstride,
+ PIXMAN_FORMAT_BPP (src_image->bits.format),
+ PIXMAN_FORMAT_BPP (dest_image->bits.format),
+ src_x, src_y, dest_x, dest_y, width, height);
+}
- srca = src >> 24;
- if (src == 0)
- {
- pixmanFillsse2 (pDst->bits.bits, pDst->bits.rowstride,
- PIXMAN_FORMAT_BPP (pDst->bits.format),
- xDst, yDst, width, height, 0);
- return;
- }
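+/* OVER of an x8r8g8b8 source (forced opaque) onto an a8r8g8b8
+ * destination through an a8 mask.
+ */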
+static void
+sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t *src, *src_line, s;
+ uint32_t *dst, *dst_line, d;
+ uint8_t *mask, *mask_line;
+ uint32_t m;
+ int src_stride, mask_stride, dst_stride;
+ int32_t w;
+ __m128i ms;
- fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
- fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
- xmmDef = createMask_2x32_128 (src, src);
- xmmSrc = expandPixel_32_1x128 (src);
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
while (height--)
{
- dst = dstLine;
- dstLine += dstStride;
- mask = maskLine;
- maskLine += maskStride;
- w = width;
+ src = src_line;
+ src_line += src_stride;
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)mask);
- cachePrefetch ((__m128i*)dst);
+ w = width;
while (w && (unsigned long)dst & 15)
{
- uint8_t m = *mask++;
+ s = 0xff000000 | *src++;
+ m = (uint32_t) *mask++;
+ d = *dst;
+ ms = unpack_32_1x128 (s);
- if (m)
- {
- *dst = pack_1x64_32 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmSrc), expandPixel_8_1x64 (m)));
- }
- else
+ if (m != 0xff)
{
- *dst = 0;
+ __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+ __m128i md = unpack_32_1x128 (d);
+
+ ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
}
+ *dst++ = pack_1x128_32 (ms);
w--;
- dst++;
}
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)mask);
- cachePrefetch ((__m128i*)dst);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)mask);
- cachePrefetchNext ((__m128i*)dst);
-
- m = *((uint32_t*)mask);
+ m = *(uint32_t*) mask;
+ xmm_src = _mm_or_si128 (
+ load_128_unaligned ((__m128i*)src), mask_ff000000);
- if (srca == 0xff && m == 0xffffffff)
+ if (m == 0xffffffff)
{
- save128Aligned ((__m128i*)dst, xmmDef);
+ save_128_aligned ((__m128i*)dst, xmm_src);
}
- else if (m)
+ else
{
- xmmMask = unpack_32_1x128 (m);
- xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());
+ xmm_dst = load_128_aligned ((__m128i*)dst);
- /* Unpacking */
- unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
+ xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128 ());
- expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
- pixMultiply_2x128 (&xmmSrc, &xmmSrc, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
+ expand_alpha_rev_2x128 (
+ xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
- save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmMaskLo, xmmMaskHi));
- }
- else
- {
- save128Aligned ((__m128i*)dst, _mm_setzero_si128());
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
}
- w -= 4;
+ src += 4;
dst += 4;
mask += 4;
+ w -= 4;
}
while (w)
{
- uint8_t m = *mask++;
+ m = (uint32_t) *mask++;
if (m)
{
- *dst = pack_1x64_32 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmSrc), expandPixel_8_1x64 (m)));
- }
- else
- {
- *dst = 0;
+ s = 0xff000000 | *src;
+
+ if (m == 0xff)
+ {
+ *dst = s;
+ }
+ else
+ {
+ __m128i ma, md, ms;
+
+ d = *dst;
+
+ ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+ md = unpack_32_1x128 (d);
+ ms = unpack_32_1x128 (s);
+
+ *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
+ }
+
}
- w--;
+ src++;
dst++;
+ w--;
}
}
- _mm_empty();
}
-/* -------------------------------------------------------------------------------------------------
- * fast_CompositeOver_n_8_0565
- */
-
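+/* OVER of an a8r8g8b8/a8b8g8r8 source through an a8 mask onto an
+ * 8888 destination, using the real per-pixel source alpha.
+ */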
static void
-sse2_CompositeOver_n_8_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int32_t xSrc,
- int32_t ySrc,
- int32_t xMask,
- int32_t yMask,
- int32_t xDst,
- int32_t yDst,
- int32_t width,
- int32_t height)
+sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
{
- uint32_t src, srca;
- uint16_t *dstLine, *dst, d;
- uint8_t *maskLine, *mask;
- int dstStride, maskStride;
- uint16_t w;
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t *src, *src_line, s;
+ uint32_t *dst, *dst_line, d;
+ uint8_t *mask, *mask_line;
uint32_t m;
- __m64 mmxSrc, mmxAlpha, mmxMask, mmxDest;
+ int src_stride, mask_stride, dst_stride;
+ int32_t w;
- __m128i xmmSrc, xmmAlpha;
- __m128i xmmMask, xmmMaskLo, xmmMaskHi;
- __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
- src = _pixman_image_get_solid(pSrc, pDst->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
- fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
-
- xmmSrc = expandPixel_32_1x128 (src);
- xmmAlpha = expandAlpha_1x128 (xmmSrc);
- mmxSrc = _mm_movepi64_pi64 (xmmSrc);
- mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
while (height--)
{
- dst = dstLine;
- dstLine += dstStride;
- mask = maskLine;
- maskLine += maskStride;
- w = width;
+ src = src_line;
+ src_line += src_stride;
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)mask);
- cachePrefetch ((__m128i*)dst);
+ w = width;
while (w && (unsigned long)dst & 15)
{
- m = *mask++;
+ uint32_t sa;
- if (m)
- {
- d = *dst;
- mmxMask = expandAlphaRev_1x64 (unpack_32_1x64 (m));
- mmxDest = expand565_16_1x64 (d);
-
- *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,
- &mmxAlpha,
- &mmxMask,
- &mmxDest)));
- }
+ s = *src++;
+ m = (uint32_t) *mask++;
+ d = *dst;
- w--;
- dst++;
- }
+ sa = s >> 24;
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)mask);
- cachePrefetch ((__m128i*)dst);
+ if (m)
+ {
+ if (sa == 0xff && m == 0xff)
+ {
+ *dst = s;
+ }
+ else
+ {
+ __m128i ms, md, ma, msa;
- while (w >= 8)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)mask);
- cachePrefetchNext ((__m128i*)dst);
+ ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+ ms = unpack_32_1x128 (s);
+ md = unpack_32_1x128 (d);
- xmmDst = load128Aligned ((__m128i*) dst);
- unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
+ msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
- m = *((uint32_t*)mask);
- mask += 4;
+ *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+ }
+ }
- if (m)
- {
- xmmMask = unpack_32_1x128 (m);
- xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());
+ dst++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ m = *(uint32_t *) mask;
- /* Unpacking */
- unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
+ if (m)
+ {
+ xmm_src = load_128_unaligned ((__m128i*)src);
- expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
- inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst0, &xmmDst1);
- }
+ if (m == 0xffffffff && is_opaque (xmm_src))
+ {
+ save_128_aligned ((__m128i *)dst, xmm_src);
+ }
+ else
+ {
+ xmm_dst = load_128_aligned ((__m128i *)dst);
- m = *((uint32_t*)mask);
- mask += 4;
+ xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
- if (m)
- {
- xmmMask = unpack_32_1x128 (m);
- xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
- /* Unpacking */
- unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
+ expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
- expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
- inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst2, &xmmDst3);
- }
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
+ &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
- save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));
+ save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+ }
- w -= 8;
- dst += 8;
+ src += 4;
+ dst += 4;
+ mask += 4;
+ w -= 4;
}
while (w)
{
- m = *mask++;
+ uint32_t sa;
- if (m)
- {
- d = *dst;
- mmxMask = expandAlphaRev_1x64 (unpack_32_1x64 (m));
- mmxDest = expand565_16_1x64 (d);
-
- *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,
- &mmxAlpha,
- &mmxMask,
- &mmxDest)));
- }
+ s = *src++;
+ m = (uint32_t) *mask++;
+ d = *dst;
+
+ sa = s >> 24;
+
+ if (m)
+ {
+ if (sa == 0xff && m == 0xff)
+ {
+ *dst = s;
+ }
+ else
+ {
+ __m128i ms, md, ma, msa;
+ ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+ ms = unpack_32_1x128 (s);
+ md = unpack_32_1x128 (d);
+
+ msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+ *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+ }
+ }
+
+ dst++;
w--;
- dst++;
}
}
- _mm_empty();
}
-/* -------------------------------------------------------------------------------------------------
- * fast_Composite_over_pixbuf_0565
- */
-
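+/* OVER_REVERSE with a solid source: each destination pixel is
+ * composited over the expanded solid color.
+ */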
static void
-sse2_Composite_over_pixbuf_0565 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int32_t xSrc,
- int32_t ySrc,
- int32_t xMask,
- int32_t yMask,
- int32_t xDst,
- int32_t yDst,
- int32_t width,
- int32_t height)
+sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
{
- uint16_t *dstLine, *dst, d;
- uint32_t *srcLine, *src, s;
- int dstStride, srcStride;
- uint16_t w;
- uint32_t opaque, zero;
-
- __m64 ms;
- __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
- __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;
-
- fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
- fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
-
-#if 0
- /* FIXME
- *
- * I copy the code from MMX one and keep the fixme.
- * If it's a problem there, probably is a problem here.
- */
- assert (pSrc->pDrawable == pMask->pDrawable);
-#endif
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t src;
+ uint32_t *dst_line, *dst;
+ __m128i xmm_src;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_dsta_hi, xmm_dsta_lo;
+ int dst_stride;
+ int32_t w;
- while (height--)
- {
- dst = dstLine;
- dstLine += dstStride;
- src = srcLine;
- srcLine += srcStride;
- w = width;
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)src);
- cachePrefetch ((__m128i*)dst);
+ if (src == 0)
+ return;
- while (w && (unsigned long)dst & 15)
- {
- s = *src++;
- d = *dst;
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
- ms = unpack_32_1x64 (s);
+ xmm_src = expand_pixel_32_1x128 (src);
- *dst++ = pack565_32_16 (pack_1x64_32 (overRevNonPre_1x64(ms, expand565_16_1x64 (d))));
- w--;
- }
+ while (height--)
+ {
+ dst = dst_line;
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)src);
- cachePrefetch ((__m128i*)dst);
+ dst_line += dst_stride;
+ w = width;
- while (w >= 8)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)src);
- cachePrefetchNext ((__m128i*)dst);
+ while (w && (unsigned long)dst & 15)
+ {
+ __m128i vd;
- /* First round */
- xmmSrc = load128Unaligned((__m128i*)src);
- xmmDst = load128Aligned ((__m128i*)dst);
+ vd = unpack_32_1x128 (*dst);
- opaque = isOpaque (xmmSrc);
- zero = isZero (xmmSrc);
+ *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
+ xmm_src));
+ w--;
+ dst++;
+ }
- unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
- unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
+ while (w >= 4)
+ {
+ __m128i tmp_lo, tmp_hi;
- /* preload next round*/
- xmmSrc = load128Unaligned((__m128i*)(src+4));
-
- if (opaque)
- {
- invertColors_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst0, &xmmDst1);
- }
- else if (!zero)
- {
- overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst0, &xmmDst1);
- }
+ xmm_dst = load_128_aligned ((__m128i*)dst);
- /* Second round */
- opaque = isOpaque (xmmSrc);
- zero = isZero (xmmSrc);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+ expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
- unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
+ tmp_lo = xmm_src;
+ tmp_hi = xmm_src;
- if (opaque)
- {
- invertColors_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst2, &xmmDst3);
- }
- else if (zero)
- {
- overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst2, &xmmDst3);
- }
+ over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+ &xmm_dsta_lo, &xmm_dsta_hi,
+ &tmp_lo, &tmp_hi);
- save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
- w -= 8;
- src += 8;
- dst += 8;
- }
+ w -= 4;
+ dst += 4;
+ }
- while (w)
- {
- s = *src++;
- d = *dst;
+ while (w)
+ {
+ __m128i vd;
- ms = unpack_32_1x64 (s);
+ vd = unpack_32_1x128 (*dst);
+
+ *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
+ xmm_src));
+ w--;
+ dst++;
+ }
- *dst++ = pack565_32_16 (pack_1x64_32 (overRevNonPre_1x64(ms, expand565_16_1x64 (d))));
- w--;
- }
}
- _mm_empty();
}
-/* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */
-
-/* -------------------------------------------------------------------------------------------------
- * fast_Composite_over_pixbuf_8888
- */
-
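+/* OVER of an a8r8g8b8 source onto an 8888 destination, modulated by
+ * the alpha channel of an a8r8g8b8 mask (only mask >> 24 is used).
+ */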
static void
-sse2_Composite_over_pixbuf_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int32_t xSrc,
- int32_t ySrc,
- int32_t xMask,
- int32_t yMask,
- int32_t xDst,
- int32_t yDst,
- int32_t width,
- int32_t height)
+sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
{
- uint32_t *dstLine, *dst, d;
- uint32_t *srcLine, *src, s;
- int dstStride, srcStride;
- uint16_t w;
- uint32_t opaque, zero;
-
- __m128i xmmSrcLo, xmmSrcHi;
- __m128i xmmDstLo, xmmDstHi;
-
- fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
- fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
-
-#if 0
- /* FIXME
- *
- * I copy the code from MMX one and keep the fixme.
- * If it's a problem there, probably is a problem here.
- */
- assert (pSrc->pDrawable == pMask->pDrawable);
-#endif
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t *src, *src_line, s;
+ uint32_t *dst, *dst_line, d;
+ uint32_t *mask, *mask_line;
+ uint32_t m;
+ int src_stride, mask_stride, dst_stride;
+ int32_t w;
+
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
while (height--)
{
- dst = dstLine;
- dstLine += dstStride;
- src = srcLine;
- srcLine += srcStride;
- w = width;
+ src = src_line;
+ src_line += src_stride;
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)src);
- cachePrefetch ((__m128i*)dst);
+ w = width;
while (w && (unsigned long)dst & 15)
{
+ uint32_t sa;
+
s = *src++;
+ m = (*mask++) >> 24;
d = *dst;
- *dst++ = pack_1x64_32 (overRevNonPre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
+ sa = s >> 24;
+
+ if (m)
+ {
+ if (sa == 0xff && m == 0xff)
+ {
+ *dst = s;
+ }
+ else
+ {
+ __m128i ms, md, ma, msa;
+
+ ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+ ms = unpack_32_1x128 (s);
+ md = unpack_32_1x128 (d);
+
+ msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+ *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+ }
+ }
+ dst++;
w--;
}
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)src);
- cachePrefetch ((__m128i*)dst);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)src);
- cachePrefetchNext ((__m128i*)dst);
-
- xmmSrcHi = load128Unaligned((__m128i*)src);
+ xmm_mask = load_128_unaligned ((__m128i*)mask);
- opaque = isOpaque (xmmSrcHi);
- zero = isZero (xmmSrcHi);
+ if (!is_transparent (xmm_mask))
+ {
+ xmm_src = load_128_unaligned ((__m128i*)src);
- unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
-
- if (opaque)
- {
- invertColors_2x128( xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi);
+ if (is_opaque (xmm_mask) && is_opaque (xmm_src))
+ {
+ save_128_aligned ((__m128i *)dst, xmm_src);
+ }
+ else
+ {
+ xmm_dst = load_128_aligned ((__m128i *)dst);
- save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
- }
- else if (!zero)
- {
- xmmDstHi = load128Aligned ((__m128i*)dst);
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
- unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
+ expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
- overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi);
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
+ &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
- save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
- }
+ save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+ }
- w -= 4;
- dst += 4;
src += 4;
+ dst += 4;
+ mask += 4;
+ w -= 4;
}
while (w)
{
+ uint32_t sa;
+
s = *src++;
+ m = (*mask++) >> 24;
d = *dst;
- *dst++ = pack_1x64_32 (overRevNonPre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
+ sa = s >> 24;
+
+ if (m)
+ {
+ if (sa == 0xff && m == 0xff)
+ {
+ *dst = s;
+ }
+ else
+ {
+ __m128i ms, md, ma, msa;
+
+ ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+ ms = unpack_32_1x128 (s);
+ md = unpack_32_1x128 (d);
+
+ msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+ *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+ }
+ }
+
+ dst++;
w--;
}
}
- _mm_empty();
}
-/* -------------------------------------------------------------------------------------------------
- * fast_CompositeOver_n_8888_0565_ca
- */
-
-static void
-sse2_CompositeOver_n_8888_0565_ca (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int32_t xSrc,
- int32_t ySrc,
- int32_t xMask,
- int32_t yMask,
- int32_t xDst,
- int32_t yDst,
- int32_t width,
- int32_t height)
+/* A variant of 'sse2_combine_over_u' with minor tweaks */
+static force_inline void
+scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
+ const uint32_t* ps,
+ int32_t w,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t max_vx,
+ pixman_bool_t fully_transparent_src)
{
- uint32_t src;
- uint16_t *dstLine, *dst, d;
- uint32_t *maskLine, *mask, m;
- int dstStride, maskStride;
- int w;
- uint32_t packCmp;
-
- __m128i xmmSrc, xmmAlpha;
- __m128i xmmMask, xmmMaskLo, xmmMaskHi;
- __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;
-
- __m64 mmxSrc, mmxAlpha, mmxMask, mmxDest;
+ uint32_t s, d;
+ const uint32_t* pm = NULL;
- src = _pixman_image_get_solid(pSrc, pDst->bits.format);
+ __m128i xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_src_lo, xmm_src_hi;
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
- if (src == 0)
- return;
+ if (fully_transparent_src)
+ return;
- fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
- fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1);
+ /* Align dst on a 16-byte boundary */
+ while (w && ((unsigned long)pd & 15))
+ {
+ d = *pd;
+ s = combine1 (ps + (vx >> 16), pm);
+ vx += unit_x;
- xmmSrc = expandPixel_32_1x128 (src);
- xmmAlpha = expandAlpha_1x128 (xmmSrc);
- mmxSrc = _mm_movepi64_pi64 (xmmSrc);
- mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);
+ *pd++ = core_combine_over_u_pixel_sse2 (s, d);
+ if (pm)
+ pm++;
+ w--;
+ }
- while (height--)
+ while (w >= 4)
{
- w = width;
- mask = maskLine;
- dst = dstLine;
- maskLine += maskStride;
- dstLine += dstStride;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)mask);
- cachePrefetch ((__m128i*)dst);
+ __m128i tmp;
+ uint32_t tmp1, tmp2, tmp3, tmp4;
+
+ tmp1 = ps[vx >> 16];
+ vx += unit_x;
+ tmp2 = ps[vx >> 16];
+ vx += unit_x;
+ tmp3 = ps[vx >> 16];
+ vx += unit_x;
+ tmp4 = ps[vx >> 16];
+ vx += unit_x;
+
+ tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
+
+ xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
+
+ if (is_opaque (xmm_src_hi))
+ {
+ save_128_aligned ((__m128i*)pd, xmm_src_hi);
+ }
+ else if (!is_zero (xmm_src_hi))
+ {
+ xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+ unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_2x128 (
+ xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+
+ over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ /* rebuild the 4 pixel data and save */
+ save_128_aligned ((__m128i*)pd,
+ pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ w -= 4;
+ pd += 4;
+ if (pm)
+ pm += 4;
+ }
- while (w && ((unsigned long)dst & 15))
- {
- m = *(uint32_t *) mask;
+ while (w)
+ {
+ d = *pd;
+ s = combine1 (ps + (vx >> 16), pm);
+ vx += unit_x;
- if (m)
- {
- d = *dst;
- mmxMask = unpack_32_1x64 (m);
- mmxDest = expand565_16_1x64 (d);
-
- *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,
- &mmxAlpha,
- &mmxMask,
- &mmxDest)));
- }
+ *pd++ = core_combine_over_u_pixel_sse2 (s, d);
+ if (pm)
+ pm++;
- w--;
- dst++;
- mask++;
- }
+ w--;
+ }
+}
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)mask);
- cachePrefetch ((__m128i*)dst);
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
+ scaled_nearest_scanline_sse2_8888_8888_OVER,
+ uint32_t, uint32_t, COVER)
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
+ scaled_nearest_scanline_sse2_8888_8888_OVER,
+ uint32_t, uint32_t, NONE)
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
+ scaled_nearest_scanline_sse2_8888_8888_OVER,
+ uint32_t, uint32_t, PAD)
- while (w >= 8)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)mask);
- cachePrefetchNext ((__m128i*)dst);
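+/* Nearest-neighbour scaled OVER with a solid mask: the constant mask
+ * alpha is broadcast once and applied to every fetched source pixel.
+ */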
+static force_inline void
+scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
+ uint32_t * dst,
+ const uint32_t * src,
+ int32_t w,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t max_vx,
+ pixman_bool_t zero_src)
+{
+ __m128i xmm_mask;
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+ if (zero_src || (*mask >> 24) == 0)
+ return;
- /* First round */
- xmmMask = load128Unaligned((__m128i*)mask);
- xmmDst = load128Aligned((__m128i*)dst);
+ xmm_mask = create_mask_16_128 (*mask >> 24);
- packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128()));
+ while (w && (unsigned long)dst & 15)
+ {
+ uint32_t s = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
+
+ if (s)
+ {
+ uint32_t d = *dst;
+
+ __m128i ms = unpack_32_1x128 (s);
+ __m128i alpha = expand_alpha_1x128 (ms);
+ __m128i dest = xmm_mask;
+ __m128i alpha_dst = unpack_32_1x128 (d);
+
+ *dst = pack_1x128_32 (
+ in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
+ }
+ dst++;
+ w--;
+ }
- unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
- unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
+ while (w >= 4)
+ {
+ uint32_t tmp1, tmp2, tmp3, tmp4;
+
+ tmp1 = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
+ tmp2 = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
+ tmp3 = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
+ tmp4 = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
+
+ xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
+
+ if (!is_zero (xmm_src))
+ {
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi);
+
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+ &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_mask, &xmm_mask,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned (
+ (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ dst += 4;
+ w -= 4;
+ }
- /* preload next round*/
- xmmMask = load128Unaligned((__m128i*)(mask+4));
- /* preload next round*/
+ while (w)
+ {
+ uint32_t s = src[pixman_fixed_to_int (vx)];
+ vx += unit_x;
- if (packCmp != 0xffff)
- {
- inOver_2x128(&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst0, &xmmDst1);
- }
+ if (s)
+ {
+ uint32_t d = *dst;
- /* Second round */
- packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128()));
+ __m128i ms = unpack_32_1x128 (s);
+ __m128i alpha = expand_alpha_1x128 (ms);
+ __m128i mask = xmm_mask;
+ __m128i dest = unpack_32_1x128 (d);
- unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
+ *dst = pack_1x128_32 (
+ in_over_1x128 (&ms, &alpha, &mask, &dest));
+ }
- if (packCmp != 0xffff)
- {
- inOver_2x128(&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst2, &xmmDst3);
- }
+ dst++;
+ w--;
+ }
- save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));
+}
- w -= 8;
- dst += 8;
- mask += 8;
- }
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
+ scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+ uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
+ scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+ uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
+ scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+ uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
+
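+/* Helpers for the bilinear scanline loops below: wt/wb are the
+ * vertical weights for the top and bottom source rows, xmm_x tracks
+ * the per-pixel horizontal fraction of vx, and xmm_addc/xmm_xorc
+ * turn its top 8 bits into the left/right horizontal weights.
+ */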
+#define BILINEAR_DECLARE_VARIABLES \
+ const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \
+ const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \
+ const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);\
+ const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1); \
+ const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x, \
+ unit_x, unit_x, unit_x, unit_x); \
+ const __m128i xmm_zero = _mm_setzero_si128 (); \
+ __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx, vx, vx, vx, vx)
+
+#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \
+do { \
+ __m128i xmm_wh, xmm_lo, xmm_hi, a; \
+ /* fetch 2x2 pixel block into sse2 register */ \
+ uint32_t tl = src_top [pixman_fixed_to_int (vx)]; \
+ uint32_t tr = src_top [pixman_fixed_to_int (vx) + 1]; \
+ uint32_t bl = src_bottom [pixman_fixed_to_int (vx)]; \
+ uint32_t br = src_bottom [pixman_fixed_to_int (vx) + 1]; \
+ a = _mm_set_epi32 (tr, tl, br, bl); \
+ vx += unit_x; \
+ /* vertical interpolation */ \
+ a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero), \
+ xmm_wt), \
+ _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero), \
+ xmm_wb)); \
+ /* calculate horizontal weights */ \
+ xmm_wh = _mm_add_epi16 (xmm_addc, \
+ _mm_xor_si128 (xmm_xorc, \
+ _mm_srli_epi16 (xmm_x, 8))); \
+ xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
+ /* horizontal interpolation */ \
+ xmm_lo = _mm_mullo_epi16 (a, xmm_wh); \
+ xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \
+ a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \
+ _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); \
+ /* shift and pack the result */ \
+ a = _mm_srli_epi32 (a, 16); \
+ a = _mm_packs_epi32 (a, a); \
+ a = _mm_packus_epi16 (a, a); \
+ pix = _mm_cvtsi128_si32 (a); \
+} while (0)
+
+#define BILINEAR_SKIP_ONE_PIXEL() \
+do { \
+ vx += unit_x; \
+ xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
+} while (0)
- while (w)
- {
- m = *(uint32_t *) mask;
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst,
+ const uint32_t * mask,
+ const uint32_t * src_top,
+ const uint32_t * src_bottom,
+ int32_t w,
+ int wt,
+ int wb,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t max_vx,
+ pixman_bool_t zero_src)
+{
+ BILINEAR_DECLARE_VARIABLES;
+ uint32_t pix1, pix2, pix3, pix4;
+
+ while ((w -= 4) >= 0)
+ {
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
+ *dst++ = pix1;
+ *dst++ = pix2;
+ *dst++ = pix3;
+ *dst++ = pix4;
+ }
- if (m)
- {
- d = *dst;
- mmxMask = unpack_32_1x64 (m);
- mmxDest = expand565_16_1x64 (d);
-
- *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,
- &mmxAlpha,
- &mmxMask,
- &mmxDest)));
- }
+ if (w & 2)
+ {
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+ *dst++ = pix1;
+ *dst++ = pix2;
+ }
- w--;
- dst++;
- mask++;
- }
+ if (w & 1)
+ {
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+ *dst = pix1;
}
- _mm_empty ();
}
-/* -------------------------------------------------------------------------------------------------
- * fast_CompositeIn_n_8_8
- */
-
-static void
-sse2_CompositeIn_n_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int32_t xSrc,
- int32_t ySrc,
- int32_t xMask,
- int32_t yMask,
- int32_t xDst,
- int32_t yDst,
- int32_t width,
- int32_t height)
-{
- uint8_t *dstLine, *dst;
- uint8_t *maskLine, *mask;
- int dstStride, maskStride;
- uint16_t w, d, m;
- uint32_t src;
- uint8_t sa;
-
- __m128i xmmAlpha;
- __m128i xmmMask, xmmMaskLo, xmmMaskHi;
- __m128i xmmDst, xmmDstLo, xmmDstHi;
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
+ scaled_bilinear_scanline_sse2_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
+ scaled_bilinear_scanline_sse2_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
+ scaled_bilinear_scanline_sse2_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ NONE, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
+ scaled_bilinear_scanline_sse2_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ NORMAL, FLAG_NONE)
- fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
- fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst,
+ const uint32_t * mask,
+ const uint32_t * src_top,
+ const uint32_t * src_bottom,
+ int32_t w,
+ int wt,
+ int wb,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t max_vx,
+ pixman_bool_t zero_src)
+{
+ BILINEAR_DECLARE_VARIABLES;
+ uint32_t pix1, pix2, pix3, pix4;
+
+ while (w && ((unsigned long)dst & 15))
+ {
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
- src = _pixman_image_get_solid(pSrc, pDst->bits.format);
+ if (pix1)
+ {
+ pix2 = *dst;
+ *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
+ }
- sa = src >> 24;
- if (sa == 0)
- return;
+ w--;
+ dst++;
+ }
- xmmAlpha = expandAlpha_1x128 (expandPixel_32_1x128 (src));
+ while (w >= 4)
+ {
+ __m128i xmm_src;
+ __m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
+ __m128i xmm_alpha_hi, xmm_alpha_lo;
+
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
+
+ xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
+
+ if (!is_zero (xmm_src))
+ {
+ if (is_opaque (xmm_src))
+ {
+ save_128_aligned ((__m128i *)dst, xmm_src);
+ }
+ else
+ {
+ __m128i xmm_dst = load_128_aligned ((__m128i *)dst);
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+ over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
+ &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+ }
+
+ w -= 4;
+ dst += 4;
+ }
- while (height--)
+ while (w)
{
- dst = dstLine;
- dstLine += dstStride;
- mask = maskLine;
- maskLine += maskStride;
- w = width;
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)mask);
- cachePrefetch ((__m128i*)dst);
+ if (pix1)
+ {
+ pix2 = *dst;
+ *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
+ }
- while (w && ((unsigned long)dst & 15))
- {
- m = (uint32_t) *mask++;
- d = (uint32_t) *dst;
+ w--;
+ dst++;
+ }
+}
- *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
- w--;
- }
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
+ scaled_bilinear_scanline_sse2_8888_8888_OVER,
+ uint32_t, uint32_t, uint32_t,
+ COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
+ scaled_bilinear_scanline_sse2_8888_8888_OVER,
+ uint32_t, uint32_t, uint32_t,
+ PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
+ scaled_bilinear_scanline_sse2_8888_8888_OVER,
+ uint32_t, uint32_t, uint32_t,
+ NONE, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
+ scaled_bilinear_scanline_sse2_8888_8888_OVER,
+ uint32_t, uint32_t, uint32_t,
+ NORMAL, FLAG_NONE)
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)mask);
- cachePrefetch ((__m128i*)dst);
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst,
+ const uint8_t * mask,
+ const uint32_t * src_top,
+ const uint32_t * src_bottom,
+ int32_t w,
+ int wt,
+ int wb,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t max_vx,
+ pixman_bool_t zero_src)
+{
+ BILINEAR_DECLARE_VARIABLES;
+ uint32_t pix1, pix2, pix3, pix4;
+ uint32_t m;
- while (w >= 16)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)mask);
- cachePrefetchNext ((__m128i*)dst);
+ while (w && ((unsigned long)dst & 15))
+ {
+ uint32_t sa;
+
+ m = (uint32_t) *mask++;
+
+ if (m)
+ {
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+ sa = pix1 >> 24;
+
+ if (sa == 0xff && m == 0xff)
+ {
+ *dst = pix1;
+ }
+ else
+ {
+ __m128i ms, md, ma, msa;
+
+ pix2 = *dst;
+ ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+ ms = unpack_32_1x128 (pix1);
+ md = unpack_32_1x128 (pix2);
+
+ msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+ *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+ }
+ }
+ else
+ {
+ BILINEAR_SKIP_ONE_PIXEL ();
+ }
+
+ w--;
+ dst++;
+ }
- xmmMask = load128Unaligned((__m128i*)mask);
- xmmDst = load128Aligned((__m128i*)dst);
+ while (w >= 4)
+ {
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+ m = *(uint32_t*)mask;
+
+ if (m)
+ {
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
+
+ xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
+
+ if (m == 0xffffffff && is_opaque (xmm_src))
+ {
+ save_128_aligned ((__m128i *)dst, xmm_src);
+ }
+ else
+ {
+ xmm_dst = load_128_aligned ((__m128i *)dst);
+
+ xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
+ expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
+ &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+ save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+ }
+ else
+ {
+ BILINEAR_SKIP_ONE_PIXEL ();
+ BILINEAR_SKIP_ONE_PIXEL ();
+ BILINEAR_SKIP_ONE_PIXEL ();
+ BILINEAR_SKIP_ONE_PIXEL ();
+ }
+
+ w -= 4;
+ dst += 4;
+ mask += 4;
+ }
- unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
- unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
+ while (w)
+ {
+ uint32_t sa;
+
+ m = (uint32_t) *mask++;
+
+ if (m)
+ {
+ BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+ sa = pix1 >> 24;
+
+ if (sa == 0xff && m == 0xff)
+ {
+ *dst = pix1;
+ }
+ else
+ {
+ __m128i ms, md, ma, msa;
+
+ pix2 = *dst;
+ ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+ ms = unpack_32_1x128 (pix1);
+ md = unpack_32_1x128 (pix2);
+
+ msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+ *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+ }
+ }
+ else
+ {
+ BILINEAR_SKIP_ONE_PIXEL ();
+ }
+
+ w--;
+ dst++;
+ }
+}
- pixMultiply_2x128 (&xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
- pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
+ scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
+ uint32_t, uint8_t, uint32_t,
+ COVER, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
+ scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
+ uint32_t, uint8_t, uint32_t,
+ PAD, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
+ scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
+ uint32_t, uint8_t, uint32_t,
+ NONE, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
+ scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
+ uint32_t, uint8_t, uint32_t,
+ NORMAL, FLAG_HAVE_NON_SOLID_MASK)
- save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
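+/* Table mapping (op, src, mask, dest) format combinations to the
+ * specialized routines above; terminated by PIXMAN_OP_NONE.
+ */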
+static const pixman_fast_path_t sse2_fast_paths[] =
+{
+ /* PIXMAN_OP_OVER */
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
+ PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
+ PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
+ PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
+ PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
+ PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
+ PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
+ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+
+ /* PIXMAN_OP_OVER_REVERSE */
+ PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
+
+ /* PIXMAN_OP_ADD */
+ PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
+ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
+ PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
+
+ /* PIXMAN_OP_SRC */
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
+ PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
+ PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
+
+ /* PIXMAN_OP_IN */
+ PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
+ PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
+ PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
+
+ SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+ SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
+ SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
+
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
+
+ SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),
- mask += 16;
- dst += 16;
- w -= 16;
- }
+ { PIXMAN_OP_NONE },
+};
- while (w)
- {
- m = (uint32_t) *mask++;
- d = (uint32_t) *dst;
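+/* Try the SSE2 blitter and fall back to the delegate implementation
+ * for cases pixman_blt_sse2 does not handle.
+ */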
+static pixman_bool_t
+sse2_blt (pixman_implementation_t *imp,
+ uint32_t * src_bits,
+ uint32_t * dst_bits,
+ int src_stride,
+ int dst_stride,
+ int src_bpp,
+ int dst_bpp,
+ int src_x,
+ int src_y,
+ int dest_x,
+ int dest_y,
+ int width,
+ int height)
+{
+ if (!pixman_blt_sse2 (
+ src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
+ src_x, src_y, dest_x, dest_y, width, height))
- *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
- w--;
- }
+ {
+ return _pixman_implementation_blt (
+ imp->delegate,
+ src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
+ src_x, src_y, dest_x, dest_y, width, height);
}
- _mm_empty();
+ return TRUE;
}
-/* -------------------------------------------------------------------------------------------------
- * fast_CompositeIn_8_8
- */
-
-static void
-sse2_CompositeIn_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int32_t xSrc,
- int32_t ySrc,
- int32_t xMask,
- int32_t yMask,
- int32_t xDst,
- int32_t yDst,
- int32_t width,
- int32_t height)
-{
- uint8_t *dstLine, *dst;
- uint8_t *srcLine, *src;
- int srcStride, dstStride;
- uint16_t w;
- uint32_t s, d;
-
- __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
- __m128i xmmDst, xmmDstLo, xmmDstHi;
-
- fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
- fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);
-
- while (height--)
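+/* force_align_arg_pointer is presumably needed on 32-bit GCC targets,
+ * where the incoming stack may not be 16-byte aligned for SSE spills.
+ */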
+#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+__attribute__((__force_align_arg_pointer__))
+#endif
+static pixman_bool_t
+sse2_fill (pixman_implementation_t *imp,
+ uint32_t * bits,
+ int stride,
+ int bpp,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t xor)
+{
+ if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
{
- dst = dstLine;
- dstLine += dstStride;
- src = srcLine;
- srcLine += srcStride;
- w = width;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)src);
- cachePrefetch ((__m128i*)dst);
-
- while (w && ((unsigned long)dst & 15))
- {
- s = (uint32_t) *src++;
- d = (uint32_t) *dst;
-
- *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s),unpack_32_1x64 (d)));
- w--;
- }
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)src);
- cachePrefetch ((__m128i*)dst);
-
- while (w >= 16)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)src);
- cachePrefetchNext ((__m128i*)dst);
-
- xmmSrc = load128Unaligned((__m128i*)src);
- xmmDst = load128Aligned((__m128i*)dst);
-
- unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
-
- pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);
-
- save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
-
- src += 16;
- dst += 16;
- w -= 16;
- }
-
- while (w)
- {
- s = (uint32_t) *src++;
- d = (uint32_t) *dst;
-
- *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s),unpack_32_1x64 (d)));
- w--;
- }
+ return _pixman_implementation_fill (
+ imp->delegate, bits, stride, bpp, x, y, width, height, xor);
}
- _mm_empty ();
+ return TRUE;
}
-/* -------------------------------------------------------------------------------------------------
- * fast_CompositeAdd_8888_8_8
- */
-
-static void
-sse2_CompositeAdd_8888_8_8 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int32_t xSrc,
- int32_t ySrc,
- int32_t xMask,
- int32_t yMask,
- int32_t xDst,
- int32_t yDst,
- int32_t width,
- int32_t height)
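+/* Scanline iterator: fetch x8r8g8b8 pixels and set the alpha byte to
+ * 0xff so the buffer can be treated as a8r8g8b8.
+ */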
+static uint32_t *
+sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
{
- uint8_t *dstLine, *dst;
- uint8_t *maskLine, *mask;
- int dstStride, maskStride;
- uint16_t w;
- uint32_t src;
- uint8_t sa;
- uint32_t m, d;
-
- __m128i xmmAlpha;
- __m128i xmmMask, xmmMaskLo, xmmMaskHi;
- __m128i xmmDst, xmmDstLo, xmmDstHi;
-
- fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
- fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
+ int w = iter->width;
+ __m128i ff000000 = mask_ff000000;
+ uint32_t *dst = iter->buffer;
+ uint32_t *src = (uint32_t *)iter->bits;
- src = _pixman_image_get_solid(pSrc, pDst->bits.format);
+ iter->bits += iter->stride;
- sa = src >> 24;
- if (sa == 0)
- return;
-
- xmmAlpha = expandAlpha_1x128 (expandPixel_32_1x128 (src));
-
- while (height--)
+ while (w && ((unsigned long)dst) & 0x0f)
{
- dst = dstLine;
- dstLine += dstStride;
- mask = maskLine;
- maskLine += maskStride;
- w = width;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)mask);
- cachePrefetch ((__m128i*)dst);
-
- while (w && ((unsigned long)dst & 15))
- {
- m = (uint32_t) *mask++;
- d = (uint32_t) *dst;
-
- *dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
- w--;
- }
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)mask);
- cachePrefetch ((__m128i*)dst);
-
- while (w >= 16)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)mask);
- cachePrefetchNext ((__m128i*)dst);
-
- xmmMask = load128Unaligned((__m128i*)mask);
- xmmDst = load128Aligned((__m128i*)dst);
-
- unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
- unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
-
- pixMultiply_2x128 (&xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
-
- xmmDstLo = _mm_adds_epu16 (xmmMaskLo, xmmDstLo);
- xmmDstHi = _mm_adds_epu16 (xmmMaskHi, xmmDstHi);
-
- save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
+ *dst++ = (*src++) | 0xff000000;
+ w--;
+ }
- mask += 16;
- dst += 16;
- w -= 16;
- }
+ while (w >= 4)
+ {
+ save_128_aligned (
+ (__m128i *)dst, _mm_or_si128 (
+ load_128_unaligned ((__m128i *)src), ff000000));
- while (w)
- {
- m = (uint32_t) *mask++;
- d = (uint32_t) *dst;
+ dst += 4;
+ src += 4;
+ w -= 4;
+ }
- *dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
- w--;
- }
+ while (w)
+ {
+ *dst++ = (*src++) | 0xff000000;
+ w--;
}
- _mm_empty();
+ return iter->buffer;
}
-/* -------------------------------------------------------------------------------------------------
- * fast_CompositeAdd_8000_8000
- */
-
-static void
-sse2_CompositeAdd_8000_8000 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int32_t xSrc,
- int32_t ySrc,
- int32_t xMask,
- int32_t yMask,
- int32_t xDst,
- int32_t yDst,
- int32_t width,
- int32_t height)
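+/* Scanline iterator: expand r5g6b5 pixels to opaque a8r8g8b8. */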
+static uint32_t *
+sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
{
- uint8_t *dstLine, *dst;
- uint8_t *srcLine, *src;
- int dstStride, srcStride;
- uint16_t w;
- uint16_t t;
+ int w = iter->width;
+ uint32_t *dst = iter->buffer;
+ uint16_t *src = (uint16_t *)iter->bits;
+ __m128i ff000000 = mask_ff000000;
- fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);
- fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
+ iter->bits += iter->stride;
- while (height--)
+ while (w && ((unsigned long)dst) & 0x0f)
{
- dst = dstLine;
- src = srcLine;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)src);
- cachePrefetch ((__m128i*)dst);
+ uint16_t s = *src++;
- dstLine += dstStride;
- srcLine += srcStride;
- w = width;
+ *dst++ = CONVERT_0565_TO_8888 (s);
+ w--;
+ }
- /* Small head */
- while (w && (unsigned long)dst & 3)
- {
- t = (*dst) + (*src++);
- *dst++ = t | (0 - (t >> 8));
- w--;
- }
+ while (w >= 8)
+ {
+ __m128i lo, hi, s;
- coreCombineAddUsse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
+ s = _mm_loadu_si128 ((__m128i *)src);
- /* Small tail */
- dst += w & 0xfffc;
- src += w & 0xfffc;
+ lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
+ hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
- w &= 3;
+ save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
+ save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
- while (w)
- {
- t = (*dst) + (*src++);
- *dst++ = t | (0 - (t >> 8));
- w--;
- }
+ dst += 8;
+ src += 8;
+ w -= 8;
}
- _mm_empty();
-}
-
-/* -------------------------------------------------------------------------------------------------
- * fast_CompositeAdd_8888_8888
- */
-static void
-sse2_CompositeAdd_8888_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int32_t xSrc,
- int32_t ySrc,
- int32_t xMask,
- int32_t yMask,
- int32_t xDst,
- int32_t yDst,
- int32_t width,
- int32_t height)
-{
- uint32_t *dstLine, *dst;
- uint32_t *srcLine, *src;
- int dstStride, srcStride;
-
- fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
- fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
-
- while (height--)
+ while (w)
{
- dst = dstLine;
- dstLine += dstStride;
- src = srcLine;
- srcLine += srcStride;
+ uint16_t s = *src++;
- coreCombineAddUsse2 (dst, src, NULL, width);
+ *dst++ = CONVERT_0565_TO_8888 (s);
+ w--;
}
- _mm_empty();
+ return iter->buffer;
}
-/* -------------------------------------------------------------------------------------------------
- * sse2_CompositeCopyArea
- */
-
-static pixman_bool_t
-pixmanBltsse2 (uint32_t *src_bits,
- uint32_t *dst_bits,
- int src_stride,
- int dst_stride,
- int src_bpp,
- int dst_bpp,
- int src_x, int src_y,
- int dst_x, int dst_y,
- int width, int height)
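+/* Scanline iterator: place each a8 value in the alpha byte of an
+ * otherwise zero 32-bit pixel.
+ */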
+static uint32_t *
+sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
{
- uint8_t * src_bytes;
- uint8_t * dst_bytes;
- int byte_width;
+ int w = iter->width;
+ uint32_t *dst = iter->buffer;
+ uint8_t *src = iter->bits;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
- if (src_bpp != dst_bpp)
- return FALSE;
+ iter->bits += iter->stride;
- if (src_bpp == 16)
- {
- src_stride = src_stride * (int) sizeof (uint32_t) / 2;
- dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
- src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
- dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
- byte_width = 2 * width;
- src_stride *= 2;
- dst_stride *= 2;
- }
- else if (src_bpp == 32)
+ while (w && (((unsigned long)dst) & 15))
{
- src_stride = src_stride * (int) sizeof (uint32_t) / 4;
- dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
- src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
- dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
- byte_width = 4 * width;
- src_stride *= 4;
- dst_stride *= 4;
- }
- else
- {
- return FALSE;
+ *dst++ = *(src++) << 24;
+ w--;
}
- cachePrefetch ((__m128i*)src_bytes);
- cachePrefetch ((__m128i*)dst_bytes);
-
- while (height--)
+ while (w >= 16)
{
- int w;
- uint8_t *s = src_bytes;
- uint8_t *d = dst_bytes;
- src_bytes += src_stride;
- dst_bytes += dst_stride;
- w = byte_width;
-
- cachePrefetchNext ((__m128i*)s);
- cachePrefetchNext ((__m128i*)d);
-
- while (w >= 2 && ((unsigned long)d & 3))
- {
- *(uint16_t *)d = *(uint16_t *)s;
- w -= 2;
- s += 2;
- d += 2;
- }
-
- while (w >= 4 && ((unsigned long)d & 15))
- {
- *(uint32_t *)d = *(uint32_t *)s;
-
- w -= 4;
- s += 4;
- d += 4;
- }
-
- cachePrefetchNext ((__m128i*)s);
- cachePrefetchNext ((__m128i*)d);
-
- while (w >= 64)
- {
- __m128i xmm0, xmm1, xmm2, xmm3;
-
- /* 128 bytes ahead */
- cachePrefetch (((__m128i*)s) + 8);
- cachePrefetch (((__m128i*)d) + 8);
-
- xmm0 = load128Unaligned ((__m128i*)(s));
- xmm1 = load128Unaligned ((__m128i*)(s+16));
- xmm2 = load128Unaligned ((__m128i*)(s+32));
- xmm3 = load128Unaligned ((__m128i*)(s+48));
-
- save128Aligned ((__m128i*)(d), xmm0);
- save128Aligned ((__m128i*)(d+16), xmm1);
- save128Aligned ((__m128i*)(d+32), xmm2);
- save128Aligned ((__m128i*)(d+48), xmm3);
-
- s += 64;
- d += 64;
- w -= 64;
- }
-
- cachePrefetchNext ((__m128i*)s);
- cachePrefetchNext ((__m128i*)d);
-
- while (w >= 16)
- {
- save128Aligned ((__m128i*)d, load128Unaligned ((__m128i*)s) );
-
- w -= 16;
- d += 16;
- s += 16;
- }
-
- cachePrefetchNext ((__m128i*)s);
- cachePrefetchNext ((__m128i*)d);
-
- while (w >= 4)
- {
- *(uint32_t *)d = *(uint32_t *)s;
-
- w -= 4;
- s += 4;
- d += 4;
- }
-
- if (w >= 2)
- {
- *(uint16_t *)d = *(uint16_t *)s;
- w -= 2;
- s += 2;
- d += 2;
- }
+ xmm0 = _mm_loadu_si128((__m128i *)src);
+
+ xmm1 = _mm_unpacklo_epi8 (_mm_setzero_si128(), xmm0);
+ xmm2 = _mm_unpackhi_epi8 (_mm_setzero_si128(), xmm0);
+ xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
+ xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
+ xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
+ xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
+
+ _mm_store_si128(((__m128i *)(dst + 0)), xmm3);
+ _mm_store_si128(((__m128i *)(dst + 4)), xmm4);
+ _mm_store_si128(((__m128i *)(dst + 8)), xmm5);
+ _mm_store_si128(((__m128i *)(dst + 12)), xmm6);
+
+ dst += 16;
+ src += 16;
+ w -= 16;
}
- _mm_empty();
-
- return TRUE;
-}
-
-static void
-sse2_CompositeCopyArea (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int32_t xSrc,
- int32_t ySrc,
- int32_t xMask,
- int32_t yMask,
- int32_t xDst,
- int32_t yDst,
- int32_t width,
- int32_t height)
-{
- pixmanBltsse2 (pSrc->bits.bits,
- pDst->bits.bits,
- pSrc->bits.rowstride,
- pDst->bits.rowstride,
- PIXMAN_FORMAT_BPP (pSrc->bits.format),
- PIXMAN_FORMAT_BPP (pDst->bits.format),
- xSrc, ySrc, xDst, yDst, width, height);
-}
-
-#if 0
-/* This code are buggy in MMX version, now the bug was translated to SSE2 version */
-void
-sse2_CompositeOver_x888_8_8888 (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int32_t xSrc,
- int32_t ySrc,
- int32_t xMask,
- int32_t yMask,
- int32_t xDst,
- int32_t yDst,
- int32_t width,
- int32_t height)
-{
- uint32_t *src, *srcLine, s;
- uint32_t *dst, *dstLine, d;
- uint8_t *mask, *maskLine;
- uint32_t m;
- int srcStride, maskStride, dstStride;
- uint16_t w;
-
- __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
- __m128i xmmDst, xmmDstLo, xmmDstHi;
- __m128i xmmMask, xmmMaskLo, xmmMaskHi;
-
- fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
- fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
- fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
-
- while (height--)
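+ /* Handle the remaining (fewer than 16) pixels one at a time. */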
+ while (w)
{
- src = srcLine;
- srcLine += srcStride;
- dst = dstLine;
- dstLine += dstStride;
- mask = maskLine;
- maskLine += maskStride;
-
- w = width;
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)src);
- cachePrefetch ((__m128i*)dst);
- cachePrefetch ((__m128i*)mask);
-
- while (w && (unsigned long)dst & 15)
- {
- s = 0xff000000 | *src++;
- m = (uint32_t) *mask++;
- d = *dst;
-
- __m64 ms = unpack_32_1x64 (s);
-
- if (m != 0xff)
- {
- ms = inOver_1x64 (ms,
- xMask00ff,
- expandAlphaRev_1x64 (unpack_32_1x64 (m)),
- unpack_32_1x64 (d));
- }
-
- *dst++ = pack_1x64_32 (ms);
- w--;
- }
-
- /* call prefetch hint to optimize cache load*/
- cachePrefetch ((__m128i*)src);
- cachePrefetch ((__m128i*)dst);
- cachePrefetch ((__m128i*)mask);
-
- while (w >= 4)
- {
- /* fill cache line with next memory */
- cachePrefetchNext ((__m128i*)src);
- cachePrefetchNext ((__m128i*)dst);
- cachePrefetchNext ((__m128i*)mask);
-
- m = *(uint32_t*) mask;
- xmmSrc = _mm_or_si128 (load128Unaligned ((__m128i*)src), Maskff000000);
-
- if (m == 0xffffffff)
- {
- save128Aligned ((__m128i*)dst, xmmSrc);
- }
- else
- {
- xmmDst = load128Aligned ((__m128i*)dst);
-
- xmmMask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
-
- unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
- unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
- unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
-
- expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
-
- inOver_2x128 (xmmSrcLo, xmmSrcHi, Mask00ff, Mask00ff, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi);
-
- save128Aligned( (__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
- }
-
- src += 4;
- dst += 4;
- mask += 4;
- w -= 4;
- }
-
- while (w)
- {
- m = (uint32_t) *mask++;
-
- if (m)
- {
- s = 0xff000000 | *src;
-
- if (m == 0xff)
- {
- *dst = s;
- }
- else
- {
- d = *dst;
-
- *dst = pack_1x64_32 (inOver_1x64 (unpack_32_1x64 (s),
- xMask00ff,
- expandAlphaRev_1x64 (unpack_32_1x64 (m)),
- unpack_32_1x64 (d)));
- }
-
- }
-
- src++;
- dst++;
- w--;
- }
+ *dst++ = *(src++) << 24;
+ w--;
}
- _mm_empty();
+ return iter->buffer;
}
-#endif
-static const pixman_fast_path_t sse2_fast_paths[] =
+typedef struct
{
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, sse2_CompositeOver_n_8_0565, 0 },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, sse2_CompositeOver_n_8_0565, 0 },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_CompositeOver_n_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_CompositeOver_n_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, sse2_CompositeOver_n_0565, 0 },
- { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_over_8888_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_over_8888_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_over_8888_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_over_8888_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_over_8888_0565, 0 },
- { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, sse2_composite_over_8888_0565, 0 },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_CompositeOver_n_8_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_CompositeOver_n_8_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_CompositeOver_n_8_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_CompositeOver_n_8_8888, 0 },
-#if 0
- /* FIXME: This code are buggy in MMX version, now the bug was translated to SSE2 version */
- { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_CompositeOver_x888_8_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_CompositeOver_x888_8_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_CompositeOver_x888_8_8888, 0 },
- { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_CompositeOver_x888_8_8888, 0 },
-#endif
- { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_Composite_over_x888_n_8888, NEED_SOLID_MASK },
- { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_Composite_over_x888_n_8888, NEED_SOLID_MASK },
- { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_Composite_over_x888_n_8888, NEED_SOLID_MASK },
- { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_Composite_over_x888_n_8888, NEED_SOLID_MASK },
- { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
- { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
- { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
- { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_CompositeOver_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_CompositeOver_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_CompositeOver_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_CompositeOver_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_CompositeOver_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_CompositeOver_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
- { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_Composite_over_pixbuf_8888, NEED_PIXBUF },
- { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, sse2_Composite_over_pixbuf_8888, NEED_PIXBUF },
- { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_Composite_over_pixbuf_8888, NEED_PIXBUF },
- { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, sse2_Composite_over_pixbuf_8888, NEED_PIXBUF },
- { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, sse2_Composite_over_pixbuf_8888, NEED_PIXBUF },
- { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_Composite_over_pixbuf_8888, NEED_PIXBUF },
- { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, sse2_Composite_over_pixbuf_8888, NEED_PIXBUF },
- { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_Composite_over_pixbuf_8888, NEED_PIXBUF },
- { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_Composite_over_pixbuf_0565, NEED_PIXBUF },
- { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5, sse2_Composite_over_pixbuf_0565, NEED_PIXBUF },
- { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5, sse2_Composite_over_pixbuf_0565, NEED_PIXBUF },
- { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_Composite_over_pixbuf_0565, NEED_PIXBUF },
- { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_CompositeCopyArea, 0 },
- { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_CompositeCopyArea, 0 },
-
- { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_CompositeAdd_8000_8000, 0 },
- { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_CompositeAdd_8888_8888, 0 },
- { PIXMAN_OP_ADD, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_CompositeAdd_8888_8888, 0 },
- { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_CompositeAdd_8888_8_8, 0 },
-
- { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_CompositeSrc_n_8_8888, 0 },
- { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_CompositeSrc_n_8_8888, 0 },
- { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_CompositeSrc_n_8_8888, 0 },
- { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_CompositeSrc_n_8_8888, 0 },
- { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_CompositeCopyArea, 0 },
- { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_CompositeCopyArea, 0 },
- { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_CompositeCopyArea, 0 },
- { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_CompositeCopyArea, 0 },
- { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_CompositeCopyArea, 0 },
- { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_CompositeCopyArea, 0 },
- { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, sse2_CompositeCopyArea, 0 },
- { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, sse2_CompositeCopyArea, 0 },
-
- { PIXMAN_OP_IN, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_CompositeIn_8_8, 0 },
- { PIXMAN_OP_IN, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_CompositeIn_n_8_8, 0 },
+ pixman_format_code_t format;
+ pixman_iter_get_scanline_t get_scanline;
+} fetcher_info_t;
- { PIXMAN_OP_NONE },
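+/* Formats that have a dedicated SSE2 scanline fetcher; the table is
+ * terminated by PIXMAN_null.
+ */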
+static const fetcher_info_t fetchers[] =
+{
+ { PIXMAN_x8r8g8b8, sse2_fetch_x8r8g8b8 },
+ { PIXMAN_r5g6b5, sse2_fetch_r5g6b5 },
+ { PIXMAN_a8, sse2_fetch_a8 },
+ { PIXMAN_null }
};
-/*
- * Work around GCC bug causing crashes in Mozilla with SSE2
- *
- * When using -msse, gcc generates movdqa instructions assuming that
- * the stack is 16 byte aligned. Unfortunately some applications, such
- * as Mozilla and Mono, end up aligning the stack to 4 bytes, which
- * causes the movdqa instructions to fail.
- *
- * The __force_align_arg_pointer__ makes gcc generate a prologue that
- * realigns the stack pointer to 16 bytes.
- *
- * On x86-64 this is not necessary because the standard ABI already
- * calls for a 16 byte aligned stack.
- *
- * See https://bugs.freedesktop.org/show_bug.cgi?id=15693
- */
-#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
-__attribute__((__force_align_arg_pointer__))
-#endif
static void
-sse2_composite (pixman_implementation_t *imp,
- pixman_op_t op,
- pixman_image_t *src,
- pixman_image_t *mask,
- pixman_image_t *dest,
- int32_t src_x,
- int32_t src_y,
- int32_t mask_x,
- int32_t mask_y,
- int32_t dest_x,
- int32_t dest_y,
- int32_t width,
- int32_t height)
-{
- if (_pixman_run_fast_path (sse2_fast_paths, imp,
- op, src, mask, dest,
- src_x, src_y,
- mask_x, mask_y,
- dest_x, dest_y,
- width, height))
+sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+{
+ pixman_image_t *image = iter->image;
+ int x = iter->x;
+ int y = iter->y;
+ int width = iter->width;
+ int height = iter->height;
+
+#define FLAGS \
+ (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | FAST_PATH_BITS_IMAGE)
+
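+ /* Use the SSE2 fetchers only for narrow iteration over an untransformed
+  * bits image whose requested area lies entirely within the image;
+  * anything else falls through to the delegate below.
+  */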
+ if ((iter->flags & ITER_NARROW) &&
+ (image->common.flags & FLAGS) == FLAGS &&
+ x >= 0 && y >= 0 &&
+ x + width <= image->bits.width &&
+ y + height <= image->bits.height)
{
- return;
+ const fetcher_info_t *f;
+
+ for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
+ {
+ if (image->common.extended_format_code == f->format)
+ {
+ uint8_t *b = (uint8_t *)image->bits.bits;
+ int s = image->bits.rowstride * 4;
+
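+ /* Point iter->bits at the first requested pixel of the first scanline. */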
+ iter->bits = b + s * iter->y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
+ iter->stride = s;
+
+ iter->get_scanline = f->get_scanline;
+ return;
+ }
+ }
}
- _pixman_implementation_composite (imp->delegate, op,
- src, mask, dest,
- src_x, src_y,
- mask_x, mask_y,
- dest_x, dest_y,
- width, height);
+ imp->delegate->src_iter_init (imp->delegate, iter);
}
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
-static pixman_bool_t
-sse2_blt (pixman_implementation_t *imp,
- uint32_t *src_bits,
- uint32_t *dst_bits,
- int src_stride,
- int dst_stride,
- int src_bpp,
- int dst_bpp,
- int src_x, int src_y,
- int dst_x, int dst_y,
- int width, int height)
-{
- if (!pixmanBltsse2 (
- src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
- src_x, src_y, dst_x, dst_y, width, height))
-
- {
- return _pixman_implementation_blt (
- imp->delegate,
- src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
- src_x, src_y, dst_x, dst_y, width, height);
- }
-
- return TRUE;
-}
-
-#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
-__attribute__((__force_align_arg_pointer__))
-#endif
-static pixman_bool_t
-sse2_fill (pixman_implementation_t *imp,
- uint32_t *bits,
- int stride,
- int bpp,
- int x,
- int y,
- int width,
- int height,
- uint32_t xor)
-{
- if (!pixmanFillsse2 (bits, stride, bpp, x, y, width, height, xor))
- {
- return _pixman_implementation_fill (
- imp->delegate, bits, stride, bpp, x, y, width, height, xor);
- }
-
- return TRUE;
-}
-
pixman_implementation_t *
-_pixman_implementation_create_sse2 (void)
+_pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
{
- pixman_implementation_t *mmx = _pixman_implementation_create_mmx ();
- pixman_implementation_t *imp = _pixman_implementation_create (mmx);
+ pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
/* SSE2 constants */
- Mask565r = createMask_2x32_128 (0x00f80000, 0x00f80000);
- Mask565g1 = createMask_2x32_128 (0x00070000, 0x00070000);
- Mask565g2 = createMask_2x32_128 (0x000000e0, 0x000000e0);
- Mask565b = createMask_2x32_128 (0x0000001f, 0x0000001f);
- MaskRed = createMask_2x32_128 (0x00f80000, 0x00f80000);
- MaskGreen = createMask_2x32_128 (0x0000fc00, 0x0000fc00);
- MaskBlue = createMask_2x32_128 (0x000000f8, 0x000000f8);
- Mask565FixRB = createMask_2x32_128 (0x00e000e0, 0x00e000e0);
- Mask565FixG = createMask_2x32_128 (0x0000c000, 0x0000c000);
- Mask0080 = createMask_16_128 (0x0080);
- Mask00ff = createMask_16_128 (0x00ff);
- Mask0101 = createMask_16_128 (0x0101);
- Maskffff = createMask_16_128 (0xffff);
- Maskff000000 = createMask_2x32_128 (0xff000000, 0xff000000);
- MaskAlpha = createMask_2x32_128 (0x00ff0000, 0x00000000);
-
- /* MMX constants */
- xMask565rgb = createMask_2x32_64 (0x000001f0, 0x003f001f);
- xMask565Unpack = createMask_2x32_64 (0x00000084, 0x04100840);
-
- xMask0080 = createMask_16_64 (0x0080);
- xMask00ff = createMask_16_64 (0x00ff);
- xMask0101 = createMask_16_64 (0x0101);
- xMaskAlpha = createMask_2x32_64 (0x00ff0000, 0x00000000);
-
- _mm_empty();
+ mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
+ mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
+ mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
+ mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
+ mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
+ mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
+ mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
+ mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
+ mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
+ mask_0080 = create_mask_16_128 (0x0080);
+ mask_00ff = create_mask_16_128 (0x00ff);
+ mask_0101 = create_mask_16_128 (0x0101);
+ mask_ffff = create_mask_16_128 (0xffff);
+ mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
+ mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
/* Set up function pointers */
-
- /* SSE code patch for fbcompose.c */
- imp->combine_32[PIXMAN_OP_OVER] = sse2CombineOverU;
- imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2CombineOverReverseU;
- imp->combine_32[PIXMAN_OP_IN] = sse2CombineInU;
- imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2CombineInReverseU;
- imp->combine_32[PIXMAN_OP_OUT] = sse2CombineOutU;
- imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2CombineOutReverseU;
- imp->combine_32[PIXMAN_OP_ATOP] = sse2CombineAtopU;
- imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2CombineAtopReverseU;
- imp->combine_32[PIXMAN_OP_XOR] = sse2CombineXorU;
- imp->combine_32[PIXMAN_OP_ADD] = sse2CombineAddU;
-
- imp->combine_32[PIXMAN_OP_SATURATE] = sse2CombineSaturateU;
-
- imp->combine_32_ca[PIXMAN_OP_SRC] = sse2CombineSrcC;
- imp->combine_32_ca[PIXMAN_OP_OVER] = sse2CombineOverC;
- imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2CombineOverReverseC;
- imp->combine_32_ca[PIXMAN_OP_IN] = sse2CombineInC;
- imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2CombineInReverseC;
- imp->combine_32_ca[PIXMAN_OP_OUT] = sse2CombineOutC;
- imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2CombineOutReverseC;
- imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2CombineAtopC;
- imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2CombineAtopReverseC;
- imp->combine_32_ca[PIXMAN_OP_XOR] = sse2CombineXorC;
- imp->combine_32_ca[PIXMAN_OP_ADD] = sse2CombineAddC;
-
- imp->composite = sse2_composite;
+ imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
+ imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
+ imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
+ imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
+ imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
+ imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
+ imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
+ imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
+ imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
+ imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
+
+ imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
+
+ imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
+ imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
+ imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
+ imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
+ imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
+ imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
+ imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
+ imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
+ imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
+ imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
+ imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
+
imp->blt = sse2_blt;
imp->fill = sse2_fill;
-
+
+ imp->src_iter_init = sse2_src_iter_init;
+
return imp;
}
-
-#endif /* USE_SSE2 */