From 92ef26dfed3337831dd5156bfe0d20b132a26a29 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Andr=C3=A9=20Tupinamb=C3=A1?= <andrelrt@gmail.com>
Date: Wed, 23 Apr 2008 00:18:39 -0400
Subject: [PATCH] Add SSE2 implementations of many compositing operations.

---
 configure.ac         |    8 +
 pixman/pixman-pict.c |   86 +-
 pixman/pixman-sse.c  | 4618 +++++++++++++++++++++++++++++++++++++++++++++++++-
 pixman/pixman-sse.h  |  305 ++++
 4 files changed, 5003 insertions(+), 14 deletions(-)

diff --git a/configure.ac b/configure.ac
index 0f52b87..637f835 100644
--- a/configure.ac
+++ b/configure.ac
@@ -131,6 +131,10 @@ dnl Check for MMX
 
 MMX_CFLAGS="-mmmx -Winline"
 
+if test "x$GCC" = "xyes"; then
+   MMX_CFLAGS="$MMX_CFLAGS --param inline-unit-growth=10000 --param large-function-growth=10000"
+fi
+
 have_mmx_intrinsics=no
 AC_MSG_CHECKING(whether to use MMX intrinsics)
 xserver_save_CFLAGS=$CFLAGS
@@ -199,6 +203,10 @@ dnl Check for SSE2
 
 SSE_CFLAGS="-mmmx -msse2 -Winline"
 
+if test "x$GCC" = "xyes"; then
+   SSE_CFLAGS="$SSE_CFLAGS --param inline-unit-growth=10000 --param large-function-growth=10000 --param max-inline-insns-single=6000"
+fi
+
 have_sse2_intrinsics=no
 AC_MSG_CHECKING(whether to use SSE2 intrinsics)
 xserver_save_CFLAGS=$CFLAGS
diff --git a/pixman/pixman-pict.c b/pixman/pixman-pict.c
index f01a643..00ac511 100644
--- a/pixman/pixman-pict.c
+++ b/pixman/pixman-pict.c
@@ -1411,6 +1411,83 @@ static const FastPathInfo mmx_fast_paths[] =
 #ifdef USE_SSE2
 static const FastPathInfo sse_fast_paths[] =
 {
+    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, fbCompositeSolidMask_nx8x0565sse2, 0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, fbCompositeSolidMask_nx8x0565sse2, 0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_a8r8g8b8, fbCompositeSolid_nx8888sse2, 0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeSolid_nx8888sse2, 0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSolid_nx0565sse2, 0 },
+    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8888sse2, 0 },
+    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8888sse2, 0 },
+    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, fbCompositeSrc_8888x8888sse2, 0 },
+    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fbCompositeSrc_8888x8888sse2, 0 },
+    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSrc_8888x0565sse2, 0 },
+    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeSrc_8888x0565sse2, 0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSolidMask_nx8x8888sse2, 0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSolidMask_nx8x8888sse2, 0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, fbCompositeSolidMask_nx8x8888sse2, 0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeSolidMask_nx8x8888sse2, 0 },
+#if 0
+    /* FIXME: This code is buggy in the MMX version, and the bug was carried over to this SSE2 version */
+    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeOver_x888x8x8888sse2, 0 },
+    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeOver_x888x8x8888sse2, 0 },
+    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeOver_x888x8x8888sse2, 0 },
+    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeOver_x888x8x8888sse2, 0 },
+#endif
+    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSrc_x888xnx8888sse2, NEED_SOLID_MASK },
+    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSrc_x888xnx8888sse2, NEED_SOLID_MASK },
+    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, fbCompositeSrc_x888xnx8888sse2, NEED_SOLID_MASK },
+    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeSrc_x888xnx8888sse2, NEED_SOLID_MASK },
+    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8x8888sse2, NEED_SOLID_MASK },
+    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8x8888sse2, NEED_SOLID_MASK },
+    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, fbCompositeSrc_8888x8x8888sse2, NEED_SOLID_MASK },
+    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeSrc_8888x8x8888sse2, NEED_SOLID_MASK },
+    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, fbCompositeSolidMask_nx8888x8888Csse2, NEED_COMPONENT_ALPHA },
+    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, fbCompositeSolidMask_nx8888x8888Csse2, NEED_COMPONENT_ALPHA },
+    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, fbCompositeSolidMask_nx8888x8888Csse2, NEED_COMPONENT_ALPHA },
+    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, fbCompositeSolidMask_nx8888x8888Csse2, NEED_COMPONENT_ALPHA },
+    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, fbCompositeSolidMask_nx8888x0565Csse2, NEED_COMPONENT_ALPHA },
+    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, fbCompositeSolidMask_nx8888x0565Csse2, NEED_COMPONENT_ALPHA },
+    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, fbCompositeSrc_8888RevNPx8888sse2, NEED_PIXBUF },
+    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, fbCompositeSrc_8888RevNPx8888sse2, NEED_PIXBUF },
+    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, fbCompositeSrc_8888RevNPx8888sse2, NEED_PIXBUF },
+    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, fbCompositeSrc_8888RevNPx8888sse2, NEED_PIXBUF },
+    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, fbCompositeSrc_8888RevNPx8888sse2, NEED_PIXBUF },
+    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, fbCompositeSrc_8888RevNPx8888sse2, NEED_PIXBUF },
+    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, fbCompositeSrc_8888RevNPx8888sse2, NEED_PIXBUF },
+    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, fbCompositeSrc_8888RevNPx8888sse2, NEED_PIXBUF },
+    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, fbCompositeSrc_8888RevNPx0565sse2, NEED_PIXBUF },
+    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5, fbCompositeSrc_8888RevNPx0565sse2, NEED_PIXBUF },
+    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5, fbCompositeSrc_8888RevNPx0565sse2, NEED_PIXBUF },
+    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, fbCompositeSrc_8888RevNPx0565sse2, NEED_PIXBUF },
+#if 0
+    /* FIXME: This code is commented out since it's apparently not actually faster than the generic code */
+    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeCopyAreasse2, 0 },
+    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fbCompositeCopyAreasse2, 0 },
+#endif
+
+    { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, fbCompositeSrcAdd_8000x8000sse2, 0 },
+    { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, fbCompositeSrcAdd_8888x8888sse2, 0 },
+    { PIXMAN_OP_ADD, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, fbCompositeSrcAdd_8888x8888sse2, 0 },
+    { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, fbCompositeSrcAdd_8888x8x8sse2, 0 },
+
+    { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSolidMaskSrc_nx8x8888sse2, 0 },
+    { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSolidMaskSrc_nx8x8888sse2, 0 },
+    { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, fbCompositeSolidMaskSrc_nx8x8888sse2, 0 },
+    { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeSolidMaskSrc_nx8x8888sse2, 0 },
+
+#if 0
+    /* FIXME: This code is commented out since it's apparently not actually faster than the generic code */
+    { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, fbCompositeCopyAreasse2, 0 },
+    { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, fbCompositeCopyAreasse2, 0 },
+    { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeCopyAreasse2, 0 },
+    { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fbCompositeCopyAreasse2, 0 },
+    { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeCopyAreasse2, 0 },
+    { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeCopyAreasse2, 0 },
+#endif
+
+    { PIXMAN_OP_IN, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, fbCompositeIn_8x8sse2, 0 },
+    { PIXMAN_OP_IN, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, fbCompositeIn_nx8x8sse2, 0 },
+
     { PIXMAN_OP_NONE },
 };
 #endif
@@ -1591,7 +1668,6 @@ static const OptimizedOperatorInfo optimized_operators[] =
     { PIXMAN_OP_NONE }
 };
 
-
 /*
  * Check if the current operator could be optimized
  */
@@ -1663,14 +1739,14 @@ pixman_image_composite (pixman_op_t op,
     pixman_bool_t dstAlphaMap = pDst->common.alpha_map != NULL;
     CompositeFunc func = NULL;
 
-#ifdef USE_SSE2
-    fbComposeSetupSSE();
-#endif
-
 #ifdef USE_MMX
     fbComposeSetupMMX();
 #endif
 
+#ifdef USE_SSE2
+    fbComposeSetupSSE();
+#endif
+
     if (srcRepeat && srcTransform &&
 	pSrc->bits.width == 1 &&
 	pSrc->bits.height == 1)
diff --git a/pixman/pixman-sse.c b/pixman/pixman-sse.c
index eb1d3d5..ca16515 100644
--- a/pixman/pixman-sse.c
+++ b/pixman/pixman-sse.c
@@ -1,5 +1,6 @@
 /*
  * Copyright © 2008 Rodrigo Kumpera
+ * Copyright © 2008 André Tupinambá
  *
  * Permission to use, copy, modify, distribute, and sell this software and its
  * documentation for any purpose is hereby granted without fee, provided that
@@ -21,31 +22,4630 @@
  * SOFTWARE.
  *
  * Author:  Rodrigo Kumpera (kumpera@gmail.com)
+ *          André Tupinambá (andrelrt@gmail.com)
  *
+ * Based on work by Owen Taylor and Søren Sandmann
  */
 
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
 
+#include <mmintrin.h>
+#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
+#include <emmintrin.h> /* for SSE2 intrinsics */
+
 #include "pixman-sse.h"
 
 #ifdef USE_SSE2
 
-void
-fbComposeSetupSSE(void)
+#ifdef _MSC_VER
+#undef inline
+#define inline __forceinline
+#endif
+
+/* -------------------------------------------------------------------------------------------------
+ * Locals
+ */
+
+static __m64 xMask0080;
+static __m64 xMask00ff;
+static __m64 xMask0101;
+static __m64 xMaskAlpha;
+
+static __m64 xMask565rgb;
+static __m64 xMask565Unpack;
+
+static __m128i Mask0080;
+static __m128i Mask00ff;
+static __m128i Mask0101;
+static __m128i Maskffff;
+static __m128i Maskff000000;
+static __m128i MaskAlpha;
+
+static __m128i Mask565r;
+static __m128i Mask565g1, Mask565g2;
+static __m128i Mask565b;
+static __m128i MaskRed;
+static __m128i MaskGreen;
+static __m128i MaskBlue;
+
+/* -------------------------------------------------------------------------------------------------
+ * SSE2 Inlines
+ */
+static inline __m128i
+unpack_32_1x128 (uint32_t data)
 {
-    static pixman_bool_t initialized = FALSE;
+    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128());
+}
 
-    if (initialized)
-	return;
-
-    /* check if we have SSE2 support and initialize accordingly */
-    if (pixman_have_sse())
+static inline void
+unpack_128_2x128 (__m128i data, __m128i* dataLo, __m128i* dataHi)
+{
+    *dataLo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
+    *dataHi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
+}
+
+static inline void
+unpack565_128_4x128 (__m128i data, __m128i* data0, __m128i* data1, __m128i* data2, __m128i* data3)
+{
+    __m128i lo, hi;
+    __m128i r, g, b;
+
+    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
+    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
+
+    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), MaskRed);
+    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), MaskGreen);
+    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), MaskBlue);
+
+    lo = _mm_or_si128 (_mm_or_si128 (r, g), b);
+
+    r = _mm_and_si128 (_mm_slli_epi32 (hi, 8), MaskRed);
+    g = _mm_and_si128 (_mm_slli_epi32 (hi, 5), MaskGreen);
+    b = _mm_and_si128 (_mm_slli_epi32 (hi, 3), MaskBlue);
+
+    hi = _mm_or_si128 (_mm_or_si128 (r, g), b);
+
+    unpack_128_2x128 (lo, data0, data1);
+    unpack_128_2x128 (hi, data2, data3);
+}
+
+static inline uint16_t
+pack565_32_16 (uint32_t pixel)
+{
+    return (uint16_t) (((pixel>>8) & 0xf800) | ((pixel>>5) & 0x07e0) | ((pixel>>3) & 0x001f));
+}
+
+static inline __m128i
+pack_2x128_128 (__m128i lo, __m128i hi)
+{
+    return _mm_packus_epi16 (lo, hi);
+}
+
+static inline __m128i
+pack565_2x128_128 (__m128i lo, __m128i hi)
+{
+    __m128i data;
+    __m128i r, g1, g2, b;
+
+    data = pack_2x128_128 (lo, hi);
+
+    r  = _mm_and_si128 (data, Mask565r);
+    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), Mask565g1);
+    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), Mask565g2);
+    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), Mask565b);
+
+    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
+}
+
+static inline __m128i
+pack565_4x128_128 (__m128i xmm0, __m128i xmm1, __m128i xmm2, __m128i xmm3)
+{
+    __m128i lo, hi;
+
+    lo = _mm_packus_epi16 (pack565_2x128_128 (xmm0, xmm1), _mm_setzero_si128 ());
+    hi = _mm_packus_epi16 (_mm_setzero_si128 (), pack565_2x128_128 (xmm2, xmm3));
+
+    return _mm_or_si128 (lo, hi);
+}
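+
+/* A worked example of the 5-6-5 packing done above (the same math as the
+ * scalar pack565_32_16): an x8r8g8b8 pixel 0x00FF8040 keeps the top 5/6/5
+ * bits of each channel,
+ *
+ *     ((0x00FF8040 >> 8) & 0xf800)   red:   0xf800
+ *     ((0x00FF8040 >> 5) & 0x07e0)   green: 0x0400
+ *     ((0x00FF8040 >> 3) & 0x001f)   blue:  0x0008
+ *
+ * giving 0xfc08.  The vector versions assemble the same three fields with
+ * shifts and the Mask565* constants, several pixels per instruction, and
+ * unpack565_128_4x128 performs the reverse expansion back to 8-bit lanes.
+ */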
+
+static inline uint32_t
+packAlpha (__m128i x)
+{
+    return _mm_cvtsi128_si32 (_mm_packus_epi16 (_mm_packus_epi16 (_mm_srli_epi32 (x, 24),
+                                                                  _mm_setzero_si128 ()),
+                                                _mm_setzero_si128 ()));
+}
+
+static inline __m128i
+expandPixel_32_1x128 (uint32_t data)
+{
+    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE(1, 0, 1, 0));
+}
+
+static inline __m128i
+expandAlpha_1x128 (__m128i data)
+{
+    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
+}
+
+static inline void
+expandAlpha_2x128 (__m128i dataLo, __m128i dataHi, __m128i* alphaLo, __m128i* alphaHi)
+{
+    __m128i lo, hi;
+
+    lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(3, 3, 3, 3));
+    hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(3, 3, 3, 3));
+    *alphaLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(3, 3, 3, 3));
+    *alphaHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(3, 3, 3, 3));
+}
+
+static inline void
+expandAlphaRev_2x128 (__m128i dataLo, __m128i dataHi, __m128i* alphaLo, __m128i* alphaHi)
+{
+    __m128i lo, hi;
+
+    lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(0, 0, 0, 0));
+    hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(0, 0, 0, 0));
+    *alphaLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(0, 0, 0, 0));
+    *alphaHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(0, 0, 0, 0));
+}
+
+static inline void
+pixMultiply_2x128 (__m128i dataLo, __m128i dataHi, __m128i alphaLo, __m128i alphaHi, __m128i* retLo, __m128i* retHi)
+{
+    __m128i lo, hi;
+
+    lo = _mm_mullo_epi16 (dataLo, alphaLo);
+    hi = _mm_mullo_epi16 (dataHi, alphaHi);
+    lo = _mm_adds_epu16 (lo, Mask0080);
+    hi = _mm_adds_epu16 (hi, Mask0080);
+    *retLo = _mm_mulhi_epu16 (lo, Mask0101);
+    *retHi = _mm_mulhi_epu16 (hi, Mask0101);
+}
+
+static inline void
+pixAddMultiply_2x128 (__m128i srcLo, __m128i srcHi, __m128i alphaDstLo, __m128i alphaDstHi,
+                      __m128i dstLo, __m128i dstHi, __m128i alphaSrcLo, __m128i alphaSrcHi,
+                      __m128i* retLo, __m128i* retHi)
+{
+    __m128i lo, hi;
+    __m128i mulLo, mulHi;
+
+    lo = _mm_mullo_epi16 (srcLo, alphaDstLo);
+    hi = _mm_mullo_epi16 (srcHi, alphaDstHi);
+    mulLo = _mm_mullo_epi16 (dstLo, alphaSrcLo);
+    mulHi = _mm_mullo_epi16 (dstHi, alphaSrcHi);
+    lo = _mm_adds_epu16 (lo, Mask0080);
+    hi = _mm_adds_epu16 (hi, Mask0080);
+    lo = _mm_adds_epu16 (lo, mulLo);
+    hi = _mm_adds_epu16 (hi, mulHi);
+    *retLo = _mm_mulhi_epu16 (lo, Mask0101);
+    *retHi = _mm_mulhi_epu16 (hi, Mask0101);
+}
+
+static inline void
+negate_2x128 (__m128i dataLo, __m128i dataHi, __m128i* negLo, __m128i* negHi)
+{
+    *negLo = _mm_xor_si128 (dataLo, Mask00ff);
+    *negHi = _mm_xor_si128 (dataHi, Mask00ff);
+}
+
+static inline void
+invertColors_2x128 (__m128i dataLo, __m128i dataHi, __m128i* invLo, __m128i* invHi)
+{
+    __m128i lo, hi;
+
+    lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(3, 0, 1, 2));
+    hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(3, 0, 1, 2));
+    *invLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(3, 0, 1, 2));
+    *invHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(3, 0, 1, 2));
+}
+
+static inline void
+over_2x128 (__m128i srcLo, __m128i srcHi, __m128i alphaLo, __m128i alphaHi, __m128i* dstLo, __m128i* dstHi)
+{
+    negate_2x128 (alphaLo, alphaHi, &alphaLo, &alphaHi);
+
+    pixMultiply_2x128 (*dstLo, *dstHi, alphaLo, alphaHi, dstLo, dstHi);
+
+    *dstLo = _mm_adds_epu8 (srcLo, *dstLo);
+    *dstHi = _mm_adds_epu8 (srcHi, *dstHi);
+}
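+
+/* Note on the arithmetic above (an explanatory sketch, not compiled code):
+ * pixMultiply computes a correctly rounded x*a/255 per 8-bit channel
+ * without a divide, using the classic identity
+ *
+ *     x*a/255 == ((x*a + 0x80) * 0x0101) >> 16
+ *
+ * which is exactly what the mullo / adds(0x0080) / mulhi(0x0101) sequence
+ * evaluates.  over_2x128 is then the premultiplied Porter-Duff OVER,
+ * in scalar form (div_255 standing for the rounded multiply above,
+ * not a real function in this file):
+ *
+ *     dst = src + div_255 (dst * (255 - srca));
+ *
+ * with the final add saturating per byte.
+ */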
+
+static inline void
+overRevNonPre_2x128 (__m128i srcLo, __m128i srcHi, __m128i* dstLo, __m128i* dstHi)
+{
+    __m128i lo, hi;
+    __m128i alphaLo, alphaHi;
+
+    expandAlpha_2x128 (srcLo, srcHi, &alphaLo, &alphaHi);
+
+    lo = _mm_or_si128 (alphaLo, MaskAlpha);
+    hi = _mm_or_si128 (alphaHi, MaskAlpha);
+
+    invertColors_2x128 (srcLo, srcHi, &srcLo, &srcHi);
+
+    pixMultiply_2x128 (srcLo, srcHi, lo, hi, &lo, &hi);
+
+    over_2x128 (lo, hi, alphaLo, alphaHi, dstLo, dstHi);
+}
+
+static inline void
+inOver_2x128 (__m128i srcLo, __m128i srcHi, __m128i alphaLo, __m128i alphaHi,
+              __m128i maskLo, __m128i maskHi, __m128i* dstLo, __m128i* dstHi)
+{
+    __m128i sLo, sHi;
+    __m128i aLo, aHi;
+
+    pixMultiply_2x128 (  srcLo,   srcHi, maskLo, maskHi, &sLo, &sHi);
+    pixMultiply_2x128 (alphaLo, alphaHi, maskLo, maskHi, &aLo, &aHi);
+
+    over_2x128 (sLo, sHi, aLo, aHi, dstLo, dstHi);
+}
+
+static inline void
+cachePrefetch (__m128i* addr)
+{
+    _mm_prefetch (addr, _MM_HINT_T0);
+}
+
+static inline void
+cachePrefetchNext (__m128i* addr)
+{
+    _mm_prefetch (addr + 4, _MM_HINT_T0); /* 64 bytes ahead */
+}
+
+/* load 4 pixels from a 16-byte boundary aligned address */
+static inline __m128i
+load128Aligned (__m128i* src)
+{
+    return _mm_load_si128 (src);
+}
+
+/* load 4 pixels from an unaligned address */
+static inline __m128i
+load128Unaligned (__m128i* src)
+{
+    return _mm_loadu_si128 (src);
+}
+
+/* save 4 pixels using Write Combining memory on a 16-byte boundary aligned address */
+static inline void
+save128WriteCombining (__m128i* dst, __m128i data)
+{
+    _mm_stream_si128 (dst, data);
+}
+
+/* save 4 pixels on a 16-byte boundary aligned address */
+static inline void
+save128Aligned (__m128i* dst, __m128i data)
+{
+    _mm_store_si128 (dst, data);
+}
+
+/* save 4 pixels on an unaligned address */
+static inline void
+save128Unaligned (__m128i* dst, __m128i data)
+{
+    _mm_storeu_si128 (dst, data);
+}
+
+/* -------------------------------------------------------------------------------------------------
+ * MMX inlines
+ */
+
+static inline __m64
+unpack_32_1x64 (uint32_t data)
+{
+    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64());
+}
+
+static inline __m64
+expandAlpha_1x64 (__m64 data)
+{
+    return _mm_shuffle_pi16 (data, _MM_SHUFFLE(3, 3, 3, 3));
+}
+
+static inline __m64
+expandAlphaRev_1x64 (__m64 data)
+{
+    return _mm_shuffle_pi16 (data, _MM_SHUFFLE(0, 0, 0, 0));
+}
+
+static inline __m64
+expandPixel_8_1x64 (uint8_t data)
+{
+    return _mm_shuffle_pi16 (unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE(0, 0, 0, 0));
+}
+
+static inline __m64
+pixMultiply_1x64 (__m64 data, __m64 alpha)
+{
+    return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
+                                          xMask0080),
+                           xMask0101);
+}
+
+static inline __m64
+pixAddMultiply_1x64 (__m64 src, __m64 alphaDst, __m64 dst, __m64 alphaSrc)
+{
+    return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (src, alphaDst),
+                                                         xMask0080),
+                                          _mm_mullo_pi16 (dst, alphaSrc)),
+                           xMask0101);
+}
+
+static inline __m64
+negate_1x64 (__m64 data)
+{
+    return _mm_xor_si64 (data, xMask00ff);
+}
+
+static inline __m64
+invertColors_1x64 (__m64 data)
+{
+    return _mm_shuffle_pi16 (data, _MM_SHUFFLE(3, 0, 1, 2));
+}
+
+static inline __m64
+over_1x64 (__m64 src, __m64 alpha, __m64 dst)
+{
+    return _mm_adds_pu8 (src, pixMultiply_1x64 (dst, negate_1x64 (alpha)));
+}
+
+static inline __m64
+inOver_1x64 (__m64 src, __m64 alpha, __m64 mask, __m64 dst)
+{
+    return over_1x64 (pixMultiply_1x64 (src, mask),
+                      pixMultiply_1x64 (alpha, mask),
+                      dst);
+}
+
+static inline __m64
+overRevNonPre_1x64 (__m64 src, __m64 dst)
+{
+    __m64 alpha = expandAlpha_1x64 (src);
+
+    return over_1x64 (pixMultiply_1x64 (invertColors_1x64 (src),
+                                        _mm_or_si64 (alpha, xMaskAlpha)),
+                      alpha,
+                      dst);
+}
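+
+/* The __m64 helpers above mirror the __m128i ones one pixel at a time.  The
+ * combiners below all follow the same pattern: step single pixels through
+ * the MMX path until dst reaches a 16-byte boundary, run the main loop four
+ * pixels per iteration with aligned stores, then finish the remainder with
+ * the MMX path again.  cachePrefetchNext always requests addr + 4 __m128i,
+ * i.e. one 64-byte cache line ahead of where the loop is currently reading.
+ */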
+
+static inline uint32_t
+pack_1x64_32 (__m64 data)
+{
+    return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64()));
+}
+
+/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
+ *
+ *    00RR00GG00BB
+ *
+ * --- Expanding 565 in the low word ---
+ *
+ * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
+ * m = m & (01f0003f001f);
+ * m = m * (008404100840);
+ * m = m >> 8;
+ *
+ * Note the trick here - the top word is shifted by another nibble to
+ * avoid it bumping into the middle word
+ */
+static inline __m64
+expand565_16_1x64 (uint16_t pixel)
+{
+    __m64 p;
+    __m64 t1, t2;
+
+    p = _mm_cvtsi32_si64 ((uint32_t) pixel);
+
+    t1 = _mm_slli_si64 (p, 36 - 11);
+    t2 = _mm_slli_si64 (p, 16 - 5);
+
+    p = _mm_or_si64 (t1, p);
+    p = _mm_or_si64 (t2, p);
+    p = _mm_and_si64 (p, xMask565rgb);
+    p = _mm_mullo_pi16 (p, xMask565Unpack);
+
+    return _mm_srli_pi16 (p, 8);
+}
+
+/* -------------------------------------------------------------------------------------------------
+ * Compose Core transformations
+ */
+static inline uint32_t
+coreCombineOverUPixelsse2 (uint32_t src, uint32_t dst)
+{
+    uint8_t a;
+    __m64 ms;
+
+    a = src >> 24;
+
+    if (a == 0xff)
+    {
+        return src;
+    }
+    else if (a)
     {
+        ms = unpack_32_1x64 (src);
+        return pack_1x64_32 (over_1x64 (ms, expandAlpha_1x64 (ms), unpack_32_1x64 (dst)));
     }
 
-    initialized = TRUE;
+    return dst;
+}
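+
+/* The wide OVER loop below tests the four alphas it just loaded via
+ * packAlpha: 0xffffffff means all four source pixels are opaque, so the
+ * source can be stored as-is; 0 means they are all transparent, so the
+ * destination is left untouched; only the mixed case pays for the full
+ * unpack / multiply / pack blend.
+ */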
+
+static inline void
+coreCombineOverUsse2 (uint32_t* pd, const uint32_t* ps, int w)
+{
+    uint32_t pa;
+    uint32_t s, d;
+
+    __m128i xmmDstLo, xmmDstHi;
+    __m128i xmmSrcLo, xmmSrcHi;
+    __m128i xmmAlphaLo, xmmAlphaHi;
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+
+    /* Align dst on a 16-byte boundary */
+    while (w && ((unsigned long)pd & 15))
+    {
+        d = *pd;
+        s = *ps++;
+
+        *pd++ = coreCombineOverUPixelsse2 (s, d);
+        w--;
+    }
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+
+    while (w >= 4)
+    {
+        /* fill cache line with next memory */
+        cachePrefetchNext ((__m128i*)ps);
+        cachePrefetchNext ((__m128i*)pd);
+
+        /* load unaligned: the source address may not be 16-byte aligned */
+        xmmSrcHi = load128Unaligned ((__m128i*) ps);
+
+        /* Check the alpha channel */
+        pa = packAlpha (xmmSrcHi);
+
+        if (pa == 0xffffffff)
+        {
+            save128Aligned ((__m128i*)pd, xmmSrcHi);
+        }
+        else if (pa)
+        {
+            xmmDstHi = load128Aligned ((__m128i*) pd);
+
+            unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
+            unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+
+            expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
+
+            over_2x128 (xmmSrcLo, xmmSrcHi, xmmAlphaLo, xmmAlphaHi, &xmmDstLo, &xmmDstHi);
+
+            /* rebuild the 4 pixels and save */
+            save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
+        }
+
+        w -= 4;
+        ps += 4;
+        pd += 4;
+    }
+
+    while (w)
+    {
+        d = *pd;
+        s = *ps++;
+
+        *pd++ = coreCombineOverUPixelsse2 (s, d);
+        w--;
+    }
+}
+
+static inline void
+coreCombineOverReverseUsse2 (uint32_t* pd, const uint32_t* ps, int w)
+{
+    uint32_t s, d;
+
+    __m128i xmmDstLo, xmmDstHi;
+    __m128i xmmSrcLo, xmmSrcHi;
+    __m128i xmmAlphaLo, xmmAlphaHi;
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+
+    /* Align dst on a 16-byte boundary */
+    while (w && ((unsigned long)pd & 15))
+    {
+        d = *pd;
+        s = *ps++;
+
+        *pd++ = coreCombineOverUPixelsse2 (d, s);
+        w--;
+    }
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+
+    while (w >= 4)
+    {
+        /* fill cache line with next memory */
+        cachePrefetchNext ((__m128i*)ps);
+        cachePrefetchNext ((__m128i*)pd);
+
+        /* load unaligned: the source address may not be 16-byte aligned */
+        xmmSrcHi = load128Unaligned ((__m128i*) ps);
+        xmmDstHi = load128Aligned ((__m128i*) pd);
+
+        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
+        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+
+        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
+
+        over_2x128 (xmmDstLo, xmmDstHi, xmmAlphaLo, xmmAlphaHi, &xmmSrcLo, &xmmSrcHi);
+
+        /* rebuild the 4 pixels and save */
+        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmSrcLo, xmmSrcHi));
+
+        w -= 4;
+        ps += 4;
+        pd += 4;
+    }
+
+    while (w)
+    {
+        d = *pd;
+        s = *ps++;
+
+        *pd++ = coreCombineOverUPixelsse2 (d, s);
+        w--;
+    }
+}
+
+static inline uint32_t
+coreCombineInUPixelsse2 (uint32_t src, uint32_t dst)
+{
+    uint32_t maska = src >> 24;
+
+    if (maska == 0)
+    {
+        return 0;
+    }
+    else if (maska != 0xff)
+    {
+        return pack_1x64_32(pixMultiply_1x64 (unpack_32_1x64 (dst), expandAlpha_1x64 (unpack_32_1x64 (src))));
+    }
+
+    return dst;
+}
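+
+/* IN and its reverse scale one operand by the other operand's alpha,
+ * channel by channel:
+ *
+ *     IN:          dst = src * dsta / 255
+ *     IN reverse:  dst = dst * srca / 255
+ *
+ * Note the argument order below: coreCombineInUsse2 passes (d, s) to the
+ * pixel helper above, whose first argument is the one supplying the alpha.
+ */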
+
+static inline void
+coreCombineInUsse2 (uint32_t* pd, const uint32_t* ps, int w)
+{
+    uint32_t s, d;
+
+    __m128i xmmSrcLo, xmmSrcHi;
+    __m128i xmmDstLo, xmmDstHi;
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+
+    while (w && ((unsigned long) pd & 15))
+    {
+        s = *ps++;
+        d = *pd;
+
+        *pd++ = coreCombineInUPixelsse2 (d, s);
+        w--;
+    }
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+
+    while (w >= 4)
+    {
+        /* fill cache line with next memory */
+        cachePrefetchNext ((__m128i*)ps);
+        cachePrefetchNext ((__m128i*)pd);
+
+        xmmDstHi = load128Aligned ((__m128i*) pd);
+        xmmSrcHi = load128Unaligned ((__m128i*) ps);
+
+        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);
+
+        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
+        pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);
+
+        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
+
+        ps += 4;
+        pd += 4;
+        w -= 4;
+    }
+
+    while (w)
+    {
+        s = *ps++;
+        d = *pd;
+
+        *pd++ = coreCombineInUPixelsse2 (d, s);
+        w--;
+    }
+}
+
+static inline void
+coreCombineReverseInUsse2 (uint32_t* pd, const uint32_t* ps, int w)
+{
+    uint32_t s, d;
+
+    __m128i xmmSrcLo, xmmSrcHi;
+    __m128i xmmDstLo, xmmDstHi;
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+
+    while (w && ((unsigned long) pd & 15))
+    {
+        s = *ps++;
+        d = *pd;
+
+        *pd++ = coreCombineInUPixelsse2 (s, d);
+        w--;
+    }
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+
+    while (w >= 4)
+    {
+        /* fill cache line with next memory */
+        cachePrefetchNext ((__m128i*)ps);
+        cachePrefetchNext ((__m128i*)pd);
+
+        xmmDstHi = load128Aligned ((__m128i*) pd);
+        xmmSrcHi = load128Unaligned ((__m128i*) ps);
+
+        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
+        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
+
+        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+        pixMultiply_2x128 (xmmDstLo, xmmDstHi, xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi);
+
+        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
+
+        ps += 4;
+        pd += 4;
+        w -= 4;
+    }
+
+    while (w)
+    {
+        s = *ps++;
+        d = *pd;
+
+        *pd++ = coreCombineInUPixelsse2 (s, d);
+        w--;
+    }
+}
+
+static inline void
+coreCombineReverseOutUsse2 (uint32_t* pd, const uint32_t* ps, int w)
+{
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+
+    while (w && ((unsigned long) pd & 15))
+    {
+        uint32_t s = *ps++;
+        uint32_t d = *pd;
+
+        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (s)))));
+        w--;
+    }
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+
+    while (w >= 4)
+    {
+        __m128i xmmSrcLo, xmmSrcHi;
+        __m128i xmmDstLo, xmmDstHi;
+
+        /* fill cache line with next memory */
+        cachePrefetchNext ((__m128i*)ps);
+        cachePrefetchNext ((__m128i*)pd);
+
+        xmmSrcHi = load128Unaligned ((__m128i*) ps);
+        xmmDstHi = load128Aligned ((__m128i*) pd);
+
+        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
+        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+
+        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
+        negate_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
+
+        pixMultiply_2x128 (xmmDstLo, xmmDstHi, xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi);
+
+        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
+
+        ps += 4;
+        pd += 4;
+        w -= 4;
+    }
+
+    while (w)
+    {
+        uint32_t s = *ps++;
+        uint32_t d = *pd;
+
+        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (s)))));
+        w--;
+    }
+}
+
+static inline void
+coreCombineOutUsse2 (uint32_t* pd, const uint32_t* ps, int w)
+{
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+
+    while (w && ((unsigned long) pd & 15))
+    {
+        uint32_t s = *ps++;
+        uint32_t d = *pd;
+
+        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
+        w--;
+    }
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+
+    while (w >= 4)
+    {
+        __m128i xmmSrcLo, xmmSrcHi;
+        __m128i xmmDstLo, xmmDstHi;
+
+        /* fill cache line with next memory */
+        cachePrefetchNext ((__m128i*)ps);
+        cachePrefetchNext ((__m128i*)pd);
+
+        xmmSrcHi = load128Unaligned ((__m128i*) ps);
+        xmmDstHi = load128Aligned ((__m128i*) pd);
+
+        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
+        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+
+        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);
+        negate_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);
+
+        pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);
+
+        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
+
+        ps += 4;
+        pd += 4;
+        w -= 4;
+    }
+
+    while (w)
+    {
+        uint32_t s = *ps++;
+        uint32_t d = *pd;
+
+        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
+        w--;
+    }
+}
+
+static inline uint32_t
+coreCombineAtopUPixelsse2 (uint32_t src, uint32_t dst)
+{
+    __m64 s = unpack_32_1x64 (src);
+    __m64 d = unpack_32_1x64 (dst);
+
+    __m64 sa = negate_1x64 (expandAlpha_1x64 (s));
+    __m64 da = expandAlpha_1x64 (d);
+
+    return pack_1x64_32 (pixAddMultiply_1x64 (s, da, d, sa));
+}
+
+static inline void
+coreCombineAtopUsse2 (uint32_t* pd, const uint32_t* ps, int w)
+{
+    uint32_t s, d;
+
+    __m128i xmmSrcLo, xmmSrcHi;
+    __m128i xmmDstLo, xmmDstHi;
+    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
+    __m128i xmmAlphaDstLo, xmmAlphaDstHi;
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+
+    while (w && ((unsigned long) pd & 15))
+    {
+        s = *ps++;
+        d = *pd;
+
+        *pd++ = coreCombineAtopUPixelsse2 (s, d);
+        w--;
+    }
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+
+    while (w >= 4)
+    {
+        /* fill cache line with next memory */
+        cachePrefetchNext ((__m128i*)ps);
+        cachePrefetchNext ((__m128i*)pd);
+
+        xmmSrcHi = load128Unaligned ((__m128i*) ps);
+        xmmDstHi = load128Aligned ((__m128i*) pd);
+
+        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
+        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+
+        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
+        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
+
+        negate_2x128 (xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
+
+        pixAddMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmAlphaDstLo, xmmAlphaDstHi,
+                              xmmDstLo, xmmDstHi, xmmAlphaSrcLo, xmmAlphaSrcHi,
+                              &xmmDstLo, &xmmDstHi);
+
+        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
+
+        ps += 4;
+        pd += 4;
+        w -= 4;
+    }
+
+    while (w)
+    {
+        s = *ps++;
+        d = *pd;
+
+        *pd++ = coreCombineAtopUPixelsse2 (s, d);
+        w--;
+    }
+}
+
+static inline uint32_t
+coreCombineReverseAtopUPixelsse2 (uint32_t src, uint32_t dst)
+{
+    __m64 s = unpack_32_1x64 (src);
+    __m64 d = unpack_32_1x64 (dst);
+
+    __m64 sa = expandAlpha_1x64 (s);
+    __m64 da = negate_1x64 (expandAlpha_1x64 (d));
+
+    return pack_1x64_32 (pixAddMultiply_1x64 (s, da, d, sa));
+}
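+
+/* ATOP blends with the two roles crossed:
+ *
+ *     ATOP:          dst = (src * dsta + dst * (255 - srca)) / 255
+ *     ATOP reverse:  dst = (src * (255 - dsta) + dst * srca) / 255
+ *
+ * pixAddMultiply folds both products into a single bias-and-mulhi round
+ * trip, so the sum is rounded once instead of once per product.
+ */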
+
+static inline void
+coreCombineReverseAtopUsse2 (uint32_t* pd, const uint32_t* ps, int w)
+{
+    uint32_t s, d;
+
+    __m128i xmmSrcLo, xmmSrcHi;
+    __m128i xmmDstLo, xmmDstHi;
+    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
+    __m128i xmmAlphaDstLo, xmmAlphaDstHi;
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+
+    while (w && ((unsigned long) pd & 15))
+    {
+        s = *ps++;
+        d = *pd;
+
+        *pd++ = coreCombineReverseAtopUPixelsse2 (s, d);
+        w--;
+    }
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+
+    while (w >= 4)
+    {
+        /* fill cache line with next memory */
+        cachePrefetchNext ((__m128i*)ps);
+        cachePrefetchNext ((__m128i*)pd);
+
+        xmmSrcHi = load128Unaligned ((__m128i*) ps);
+        xmmDstHi = load128Aligned ((__m128i*) pd);
+
+        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
+        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+
+        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
+        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
+
+        negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
+
+        pixAddMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmAlphaDstLo, xmmAlphaDstHi,
+                              xmmDstLo, xmmDstHi, xmmAlphaSrcLo, xmmAlphaSrcHi,
+                              &xmmDstLo, &xmmDstHi);
+
+        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
+
+        ps += 4;
+        pd += 4;
+        w -= 4;
+    }
+
+    while (w)
+    {
+        s = *ps++;
+        d = *pd;
+
+        *pd++ = coreCombineReverseAtopUPixelsse2 (s, d);
+        w--;
+    }
+}
+
+static inline uint32_t
+coreCombineXorUPixelsse2 (uint32_t src, uint32_t dst)
+{
+    __m64 s = unpack_32_1x64 (src);
+    __m64 d = unpack_32_1x64 (dst);
+
+    return pack_1x64_32 (pixAddMultiply_1x64 (s, negate_1x64 (expandAlpha_1x64 (d)), d, negate_1x64 (expandAlpha_1x64 (s))));
+}
+
+static inline void
+coreCombineXorUsse2 (uint32_t* dst, const uint32_t* src, int width)
+{
+    int w = width;
+    uint32_t s, d;
+    uint32_t* pd = dst;
+    const uint32_t* ps = src;
+
+    __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
+    __m128i xmmDst, xmmDstLo, xmmDstHi;
+    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
+    __m128i xmmAlphaDstLo, xmmAlphaDstHi;
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+
+    while (w && ((unsigned long) pd & 15))
+    {
+        s = *ps++;
+        d = *pd;
+
+        *pd++ = coreCombineXorUPixelsse2 (s, d);
+        w--;
+    }
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+
+    while (w >= 4)
+    {
+        /* fill cache line with next memory */
+        cachePrefetchNext ((__m128i*)ps);
+        cachePrefetchNext ((__m128i*)pd);
+
+        xmmSrc = load128Unaligned ((__m128i*) ps);
+        xmmDst = load128Aligned ((__m128i*) pd);
+
+        unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
+        unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
+
+        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
+        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
+
+        negate_2x128 (xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
+        negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
+
+        pixAddMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmAlphaDstLo, xmmAlphaDstHi,
+                              xmmDstLo, xmmDstHi, xmmAlphaSrcLo, xmmAlphaSrcHi,
+                              &xmmDstLo, &xmmDstHi);
+
+        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
+
+        ps += 4;
+        pd += 4;
+        w -= 4;
+    }
+
+    while (w)
+    {
+        s = *ps++;
+        d = *pd;
+
+        *pd++ = coreCombineXorUPixelsse2 (s, d);
+        w--;
+    }
+}
+
+static inline void
+coreCombineAddUsse2 (uint32_t* dst, const uint32_t* src, int width)
+{
+    int w = width;
+    uint32_t s, d;
+    uint32_t* pd = dst;
+    const uint32_t* ps = src;
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+
+    while (w && (unsigned long)pd & 15)
+    {
+        s = *ps++;
+        d = *pd;
+        *pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
+        w--;
+    }
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+
+    while (w >= 4)
+    {
+        /* fill cache line with next memory */
+        cachePrefetchNext ((__m128i*)ps);
+        cachePrefetchNext ((__m128i*)pd);
+
+        save128Aligned ((__m128i*)pd,
+                        _mm_adds_epu8 (load128Unaligned ((__m128i*)ps),
+                                       load128Aligned ((__m128i*)pd)));
+        pd += 4;
+        ps += 4;
+        w -= 4;
+    }
+
+    while (w--)
+    {
+        s = *ps++;
+        d = *pd;
+        *pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
+    }
+}
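+
+/* ADD is a plain per-channel saturating add (_mm_adds_epu8 does all the
+ * work).  SATURATE below is ADD except that a source pixel whose alpha
+ * exceeds the room left in the destination (255 - dsta) is first scaled by
+ * (255 - dsta) / srca via FbIntDiv; the movemask test lets the main loop
+ * choose between the cheap all-ADD store and that per-pixel slow path four
+ * pixels at a time.
+ */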
+
+static inline uint32_t
+coreCombineSaturateUPixelsse2 (uint32_t src, uint32_t dst)
+{
+    __m64 ms = unpack_32_1x64 (src);
+    __m64 md = unpack_32_1x64 (dst);
+    uint32_t sa = src >> 24;
+    uint32_t da = ~dst >> 24;
+
+    if (sa > da)
+    {
+        ms = pixMultiply_1x64 (ms, expandAlpha_1x64 (unpack_32_1x64 (FbIntDiv(da, sa) << 24)));
+    }
+
+    return pack_1x64_32 (_mm_adds_pu16 (md, ms));
+}
+
+static inline void
+coreCombineSaturateUsse2 (uint32_t *pd, const uint32_t *ps, int w)
+{
+    uint32_t s, d;
+
+    uint32_t packCmp;
+    __m128i xmmSrc, xmmDst;
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+
+    while (w && (unsigned long)pd & 15)
+    {
+        s = *ps++;
+        d = *pd;
+        *pd++ = coreCombineSaturateUPixelsse2 (s, d);
+        w--;
+    }
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+
+    while (w >= 4)
+    {
+        /* fill cache line with next memory */
+        cachePrefetchNext ((__m128i*)ps);
+        cachePrefetchNext ((__m128i*)pd);
+
+        xmmDst = load128Aligned ((__m128i*)pd);
+        xmmSrc = load128Unaligned ((__m128i*)ps);
+
+        packCmp = _mm_movemask_epi8 (_mm_cmpgt_epi32 (_mm_srli_epi32 (xmmSrc, 24),
+                                                      _mm_srli_epi32 (_mm_xor_si128 (xmmDst, Maskff000000), 24)));
+
+        /* if some source alpha is greater than the respective ~dst alpha */
+        if (packCmp)
+        {
+            s = *ps++;
+            d = *pd;
+            *pd++ = coreCombineSaturateUPixelsse2 (s, d);
+
+            s = *ps++;
+            d = *pd;
+            *pd++ = coreCombineSaturateUPixelsse2 (s, d);
+
+            s = *ps++;
+            d = *pd;
+            *pd++ = coreCombineSaturateUPixelsse2 (s, d);
+
+            s = *ps++;
+            d = *pd;
+            *pd++ = coreCombineSaturateUPixelsse2 (s, d);
+        }
+        else
+        {
+            save128Aligned ((__m128i*)pd, _mm_adds_epu8 (xmmDst, xmmSrc));
+
+            pd += 4;
+            ps += 4;
+        }
+
+        w -= 4;
+    }
+
+    while (w--)
+    {
+        s = *ps++;
+        d = *pd;
+        *pd++ = coreCombineSaturateUPixelsse2 (s, d);
+    }
+}
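+
+/* The *C combiners below are the component-alpha variants: the mask holds a
+ * separate 8-bit alpha per channel rather than one alpha per pixel, so each
+ * formula above reappears with "src" replaced by "src * mask" and "srca" by
+ * "mask * srca", evaluated channelwise.
+ */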
+
+static inline void
+coreCombineSrcCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
+{
+    uint32_t s, m;
+
+    __m128i xmmSrcLo, xmmSrcHi;
+    __m128i xmmMaskLo, xmmMaskHi;
+    __m128i xmmDstLo, xmmDstHi;
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+    cachePrefetch ((__m128i*)pm);
+
+    while (w && (unsigned long)pd & 15)
+    {
+        s = *ps++;
+        m = *pm++;
+        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
+        w--;
+    }
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+    cachePrefetch ((__m128i*)pm);
+
+    while (w >= 4)
+    {
+        /* fill cache line with next memory */
+        cachePrefetchNext ((__m128i*)ps);
+        cachePrefetchNext ((__m128i*)pd);
+        cachePrefetchNext ((__m128i*)pm);
+
+        xmmSrcHi = load128Unaligned ((__m128i*)ps);
+        xmmMaskHi = load128Unaligned ((__m128i*)pm);
+
+        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
+        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
+
+        pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi);
+
+        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
+
+        ps += 4;
+        pd += 4;
+        pm += 4;
+        w -= 4;
+    }
+
+    while (w)
+    {
+        s = *ps++;
+        m = *pm++;
+        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
+        w--;
+    }
+}
+
+static inline uint32_t
+coreCombineOverCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
+{
+    __m64 s = unpack_32_1x64 (src);
+
+    return pack_1x64_32 (inOver_1x64 (s, expandAlpha_1x64 (s), unpack_32_1x64 (mask), unpack_32_1x64 (dst)));
+}
+
+static inline void
+coreCombineOverCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
+{
+    uint32_t s, m, d;
+
+    __m128i xmmAlphaLo, xmmAlphaHi;
+    __m128i xmmSrcLo, xmmSrcHi;
+    __m128i xmmDstLo, xmmDstHi;
+    __m128i xmmMaskLo, xmmMaskHi;
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+    cachePrefetch ((__m128i*)pm);
+
+    while (w && (unsigned long)pd & 15)
+    {
+        s = *ps++;
+        m = *pm++;
+        d = *pd;
+
+        *pd++ = coreCombineOverCPixelsse2 (s, m, d);
+        w--;
+    }
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+    cachePrefetch ((__m128i*)pm);
+
+    while (w >= 4)
+    {
+        /* fill cache line with next memory */
+        cachePrefetchNext ((__m128i*)ps);
+        cachePrefetchNext ((__m128i*)pd);
+        cachePrefetchNext ((__m128i*)pm);
+
+        xmmDstHi = load128Aligned ((__m128i*)pd);
+        xmmSrcHi = load128Unaligned ((__m128i*)ps);
+        xmmMaskHi = load128Unaligned ((__m128i*)pm);
+
+        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
+        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
+
+        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
+
+        inOver_2x128 (xmmSrcLo, xmmSrcHi, xmmAlphaLo, xmmAlphaHi, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi);
+
+        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
+
+        ps += 4;
+        pd += 4;
+        pm += 4;
+        w -= 4;
+    }
+
+    while (w)
+    {
+        s = *ps++;
+        m = *pm++;
+        d = *pd;
+
+        *pd++ = coreCombineOverCPixelsse2 (s, m, d);
+        w--;
+    }
+}
+
+static inline uint32_t
+coreCombineOverReverseCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
+{
+    __m64 d = unpack_32_1x64 (dst);
+
+    return pack_1x64_32(over_1x64 (d, expandAlpha_1x64 (d), pixMultiply_1x64 (unpack_32_1x64 (src), unpack_32_1x64 (mask))));
+}
+
+static inline void
+coreCombineOverReverseCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
+{
+    uint32_t s, m, d;
+
+    __m128i xmmAlphaLo, xmmAlphaHi;
+    __m128i xmmSrcLo, xmmSrcHi;
+    __m128i xmmDstLo, xmmDstHi;
+    __m128i xmmMaskLo, xmmMaskHi;
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+    cachePrefetch ((__m128i*)pm);
+
+    while (w && (unsigned long)pd & 15)
+    {
+        s = *ps++;
+        m = *pm++;
+        d = *pd;
+
+        *pd++ = coreCombineOverReverseCPixelsse2 (s, m, d);
+        w--;
+    }
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+    cachePrefetch ((__m128i*)pm);
+
+    while (w >= 4)
+    {
+        /* fill cache line with next memory */
+        cachePrefetchNext ((__m128i*)ps);
+        cachePrefetchNext ((__m128i*)pd);
+        cachePrefetchNext ((__m128i*)pm);
+
+        xmmDstHi = load128Aligned ((__m128i*)pd);
+        xmmSrcHi = load128Unaligned ((__m128i*)ps);
+        xmmMaskHi = load128Unaligned ((__m128i*)pm);
+
+        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
+        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
+
+        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
+        pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
+
+        over_2x128 (xmmDstLo, xmmDstHi, xmmAlphaLo, xmmAlphaHi, &xmmMaskLo, &xmmMaskHi);
+
+        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmMaskLo, xmmMaskHi));
+
+        ps += 4;
+        pd += 4;
+        pm += 4;
+        w -= 4;
+    }
+
+    while (w)
+    {
+        s = *ps++;
+        m = *pm++;
+        d = *pd;
+
+        *pd++ = coreCombineOverReverseCPixelsse2 (s, m, d);
+        w--;
+    }
+}
+
+static inline void
+coreCombineInCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
+{
+    uint32_t s, m, d;
+
+    __m128i xmmAlphaLo, xmmAlphaHi;
+    __m128i xmmSrcLo, xmmSrcHi;
+    __m128i xmmDstLo, xmmDstHi;
+    __m128i xmmMaskLo, xmmMaskHi;
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+    cachePrefetch ((__m128i*)pm);
+
+    while (w && (unsigned long)pd & 15)
+    {
+        s = *ps++;
+        m = *pm++;
+        d = *pd;
+
+        *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
+                                                expandAlpha_1x64 (unpack_32_1x64 (d))));
+        w--;
+    }
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+    cachePrefetch ((__m128i*)pm);
+
+    while (w >= 4)
+    {
+        /* fill cache line with next memory */
+        cachePrefetchNext ((__m128i*)ps);
+        cachePrefetchNext ((__m128i*)pd);
+        cachePrefetchNext ((__m128i*)pm);
+
+        xmmDstHi = load128Aligned ((__m128i*)pd);
+        xmmSrcHi = load128Unaligned ((__m128i*)ps);
+        xmmMaskHi = load128Unaligned ((__m128i*)pm);
+
+        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
+        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
+
+        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
+        pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi);
+
+        pixMultiply_2x128 (xmmDstLo, xmmDstHi, xmmAlphaLo, xmmAlphaHi, &xmmDstLo, &xmmDstHi);
+
+        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
+
+        ps += 4;
+        pd += 4;
+        pm += 4;
+        w -= 4;
+    }
+
+    while (w)
+    {
+        s = *ps++;
+        m = *pm++;
+        d = *pd;
+
+        *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
+                                                expandAlpha_1x64 (unpack_32_1x64 (d))));
+        w--;
+    }
+}
+
+static inline void
+coreCombineInReverseCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
+{
+    uint32_t s, m, d;
+
+    __m128i xmmAlphaLo, xmmAlphaHi;
+    __m128i xmmSrcLo, xmmSrcHi;
+    __m128i xmmDstLo, xmmDstHi;
+    __m128i xmmMaskLo, xmmMaskHi;
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+    cachePrefetch ((__m128i*)pm);
+
+    while (w && (unsigned long)pd & 15)
+    {
+        s = *ps++;
+        m = *pm++;
+        d = *pd;
+
+        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
+                                                pixMultiply_1x64 (unpack_32_1x64 (m),
+                                                                  expandAlpha_1x64 (unpack_32_1x64 (s)))));
+        w--;
+    }
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+    cachePrefetch ((__m128i*)pm);
+
+    while (w >= 4)
+    {
+        /* fill cache line with next memory */
+        cachePrefetchNext ((__m128i*)ps);
+        cachePrefetchNext ((__m128i*)pd);
+        cachePrefetchNext ((__m128i*)pm);
+
+        xmmDstHi = load128Aligned ((__m128i*)pd);
+        xmmSrcHi = load128Unaligned ((__m128i*)ps);
+        xmmMaskHi = load128Unaligned ((__m128i*)pm);
+
+        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
+        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
+
+        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
+        pixMultiply_2x128 (xmmMaskLo, xmmMaskHi, xmmAlphaLo, xmmAlphaHi, &xmmAlphaLo, &xmmAlphaHi);
+
+        pixMultiply_2x128 (xmmDstLo, xmmDstHi, xmmAlphaLo, xmmAlphaHi, &xmmDstLo, &xmmDstHi);
+
+        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
+
+        ps += 4;
+        pd += 4;
+        pm += 4;
+        w -= 4;
+    }
+
+    while (w)
+    {
+        s = *ps++;
+        m = *pm++;
+        d = *pd;
+
+        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
+                                                pixMultiply_1x64 (unpack_32_1x64 (m),
+                                                                  expandAlpha_1x64 (unpack_32_1x64 (s)))));
+        w--;
+    }
+}
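+
+/* OUT keeps only the part of one operand not covered by the other,
+ * per channel:
+ *
+ *     OUT:          dst = (src * mask / 255) * (255 - dsta) / 255
+ *     OUT reverse:  dst = dst * (255 - mask * srca / 255) / 255
+ *
+ * both built from the usual rounded multiplies.
+ */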
+
+static inline void
+coreCombineOutCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
+{
+    uint32_t s, m, d;
+
+    __m128i xmmAlphaLo, xmmAlphaHi;
+    __m128i xmmSrcLo, xmmSrcHi;
+    __m128i xmmDstLo, xmmDstHi;
+    __m128i xmmMaskLo, xmmMaskHi;
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+    cachePrefetch ((__m128i*)pm);
+
+    while (w && (unsigned long)pd & 15)
+    {
+        s = *ps++;
+        m = *pm++;
+        d = *pd;
+
+        *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
+                                                negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
+        w--;
+    }
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+    cachePrefetch ((__m128i*)pm);
+
+    while (w >= 4)
+    {
+        /* fill cache line with next memory */
+        cachePrefetchNext ((__m128i*)ps);
+        cachePrefetchNext ((__m128i*)pd);
+        cachePrefetchNext ((__m128i*)pm);
+
+        xmmDstHi = load128Aligned ((__m128i*)pd);
+        xmmSrcHi = load128Unaligned ((__m128i*)ps);
+        xmmMaskHi = load128Unaligned ((__m128i*)pm);
+
+        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
+        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
+
+        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
+        negate_2x128 (xmmAlphaLo, xmmAlphaHi, &xmmAlphaLo, &xmmAlphaHi);
+
+        pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi);
+        pixMultiply_2x128 (xmmDstLo, xmmDstHi, xmmAlphaLo, xmmAlphaHi, &xmmDstLo, &xmmDstHi);
+
+        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
+
+        ps += 4;
+        pd += 4;
+        pm += 4;
+        w -= 4;
+    }
+
+    while (w)
+    {
+        s = *ps++;
+        m = *pm++;
+        d = *pd;
+
+        *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
+                                                negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
+        w--;
+    }
+}
+
+static inline void
+coreCombineOutReverseCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
+{
+    uint32_t s, m, d;
+
+    __m128i xmmAlphaLo, xmmAlphaHi;
+    __m128i xmmSrcLo, xmmSrcHi;
+    __m128i xmmDstLo, xmmDstHi;
+    __m128i xmmMaskLo, xmmMaskHi;
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+    cachePrefetch ((__m128i*)pm);
+
+    while (w && (unsigned long)pd & 15)
+    {
+        s = *ps++;
+        m = *pm++;
+        d = *pd;
+
+        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
+                                                negate_1x64 (pixMultiply_1x64 (unpack_32_1x64 (m),
+                                                                               expandAlpha_1x64 (unpack_32_1x64 (s))))));
+        w--;
+    }
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+    cachePrefetch ((__m128i*)pm);
+
+    while (w >= 4)
+    {
+        /* fill cache line with next memory */
+        cachePrefetchNext ((__m128i*)ps);
+        cachePrefetchNext ((__m128i*)pd);
+        cachePrefetchNext ((__m128i*)pm);
+
+        xmmDstHi = load128Aligned ((__m128i*)pd);
+        xmmSrcHi = load128Unaligned ((__m128i*)ps);
+        xmmMaskHi = load128Unaligned ((__m128i*)pm);
+
+        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
+        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
+
+        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
+
+        pixMultiply_2x128 (xmmMaskLo, xmmMaskHi, xmmAlphaLo, xmmAlphaHi, &xmmMaskLo, &xmmMaskHi);
+
+        negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
+
+        pixMultiply_2x128 (xmmDstLo, xmmDstHi, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi);
+
+        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
+
+        ps += 4;
+        pd += 4;
+        pm += 4;
+        w -= 4;
+    }
+
+    while (w)
+    {
+        s = *ps++;
+        m = *pm++;
+        d = *pd;
+
+        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
+                                                negate_1x64 (pixMultiply_1x64 (unpack_32_1x64 (m),
+                                                                               expandAlpha_1x64 (unpack_32_1x64 (s))))));
+        w--;
+    }
+}
+
+static inline uint32_t
+coreCombineAtopCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
+{
+    __m64 m = unpack_32_1x64 (mask);
+    __m64 s = unpack_32_1x64 (src);
+    __m64 d = unpack_32_1x64 (dst);
+    __m64 sa = expandAlpha_1x64 (s);
+    __m64 da = expandAlpha_1x64 (d);
+
+    s = pixMultiply_1x64 (s, m);
+    m = negate_1x64 (pixMultiply_1x64 (m, sa));
+
+    return pack_1x64_32 (pixAddMultiply_1x64 (d, m, s, da));
+}
+
+static inline void
+coreCombineAtopCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
+{
+    uint32_t s, m, d;
+
+    __m128i xmmSrcLo, xmmSrcHi;
+    __m128i xmmDstLo, xmmDstHi;
+    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
+    __m128i xmmAlphaDstLo, xmmAlphaDstHi;
+    __m128i xmmMaskLo, xmmMaskHi;
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+    cachePrefetch ((__m128i*)pm);
+
+    while (w && (unsigned long)pd & 15)
+    {
+        s = *ps++;
+        m = *pm++;
+        d = *pd;
+
+        *pd++ = coreCombineAtopCPixelsse2 (s, m, d);
+        w--;
+    }
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+    cachePrefetch ((__m128i*)pm);
+
+    while (w >= 4)
+    {
+        /* fill cache line with next memory */
+        cachePrefetchNext ((__m128i*)ps);
+        cachePrefetchNext ((__m128i*)pd);
+        cachePrefetchNext ((__m128i*)pm);
+
+        xmmDstHi = load128Aligned ((__m128i*)pd);
+        xmmSrcHi = load128Unaligned ((__m128i*)ps);
+        xmmMaskHi = load128Unaligned ((__m128i*)pm);
+
+        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
+        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
+
+        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
+        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
+
+        pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmMaskLo, xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
+        pixMultiply_2x128 (xmmMaskLo, xmmMaskHi, xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi);
+
+        negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
+
+        pixAddMultiply_2x128 (xmmDstLo, xmmDstHi, xmmMaskLo, xmmMaskHi,
+                              xmmSrcLo, xmmSrcHi, xmmAlphaDstLo, xmmAlphaDstHi,
+                              &xmmDstLo, &xmmDstHi);
+
+        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
+
+        ps += 4;
+        pd += 4;
+        pm += 4;
+        w -= 4;
+    }
+
+    while (w)
+    {
+        s = *ps++;
+        m = *pm++;
+        d = *pd;
+
+        *pd++ = coreCombineAtopCPixelsse2 (s, m, d);
+        w--;
+    }
+}
+
+static inline uint32_t
+coreCombineReverseAtopCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
+{
+    __m64 m = unpack_32_1x64 (mask);
+    __m64 s = unpack_32_1x64 (src);
+    __m64 d = unpack_32_1x64 (dst);
+
+    __m64 da = negate_1x64 (expandAlpha_1x64 (d));
+    __m64 sa = expandAlpha_1x64 (s);
+
+    s = pixMultiply_1x64 (s, m);
+    m = pixMultiply_1x64 (m, sa);
+
+    return pack_1x64_32 (pixAddMultiply_1x64 (d, m, s, da));
+}
+
+static inline void
+coreCombineReverseAtopCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
+{
+    uint32_t s, m, d;
+
+    __m128i xmmSrcLo, xmmSrcHi;
+    __m128i xmmDstLo, xmmDstHi;
+    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
+    __m128i xmmAlphaDstLo, xmmAlphaDstHi;
+    __m128i xmmMaskLo, xmmMaskHi;
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+    cachePrefetch ((__m128i*)pm);
+
+    while (w && (unsigned long)pd & 15)
+    {
+        s = *ps++;
+        m = *pm++;
+        d = *pd;
+
+        *pd++ = coreCombineReverseAtopCPixelsse2 (s, m, d);
+        w--;
+    }
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+    cachePrefetch ((__m128i*)pm);
+
+    while (w >= 4)
+    {
+        /* fill cache line with next memory */
+        cachePrefetchNext ((__m128i*)ps);
+        cachePrefetchNext ((__m128i*)pd);
+        cachePrefetchNext ((__m128i*)pm);
+
+        xmmDstHi = load128Aligned ((__m128i*)pd);
+        xmmSrcHi = load128Unaligned ((__m128i*)ps);
+        xmmMaskHi = load128Unaligned ((__m128i*)pm);
+
+        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
+        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
+
+        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
+        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
+
+        pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmMaskLo, xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
+        pixMultiply_2x128 (xmmMaskLo, xmmMaskHi, xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi);
+
+        negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
+
+        pixAddMultiply_2x128 (xmmDstLo, xmmDstHi, xmmMaskLo, xmmMaskHi,
+                              xmmSrcLo, xmmSrcHi, xmmAlphaDstLo, xmmAlphaDstHi,
+                              &xmmDstLo, &xmmDstHi);
+
+        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
+
+        ps += 4;
+        pd += 4;
+        pm += 4;
+        w -= 4;
+    }
+
+    while (w)
+    {
+        s = *ps++;
+        m = *pm++;
+        d = *pd;
+
+        *pd++ = coreCombineReverseAtopCPixelsse2 (s, m, d);
+        w--;
+    }
+}
+
+static inline uint32_t
+coreCombineXorCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
+{
+    __m64 a = unpack_32_1x64 (mask);
+    __m64 s = unpack_32_1x64 (src);
+    __m64 d = unpack_32_1x64 (dst);
+
+    return pack_1x64_32 (pixAddMultiply_1x64 (d,
+                                              negate_1x64 (pixMultiply_1x64 (a, expandAlpha_1x64 (s))),
+                                              pixMultiply_1x64 (s, a),
+                                              negate_1x64 (expandAlpha_1x64 (d))));
+}
+
+static inline void
+coreCombineXorCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
+{
+    uint32_t s, m, d;
+
+    __m128i xmmSrcLo, xmmSrcHi;
+    __m128i xmmDstLo, xmmDstHi;
+    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
+    __m128i xmmAlphaDstLo, xmmAlphaDstHi;
+    __m128i xmmMaskLo, xmmMaskHi;
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+    cachePrefetch ((__m128i*)pm);
+
+    while (w && (unsigned long)pd & 15)
+    {
+        s = *ps++;
+        m = *pm++;
+        d = *pd;
+
+        *pd++ = coreCombineXorCPixelsse2 (s, m, d);
+        w--;
+    }
+
+    /* call prefetch hint to optimize cache load */
+    cachePrefetch ((__m128i*)ps);
+    cachePrefetch ((__m128i*)pd);
+    cachePrefetch ((__m128i*)pm);
+
+    while (w >= 4)
+    {
+        /* fill cache line with next memory */
+        cachePrefetchNext ((__m128i*)ps);
+        cachePrefetchNext ((__m128i*)pd);
+        cachePrefetchNext ((__m128i*)pm);
+
+        xmmDstHi = load128Aligned ((__m128i*)pd);
+        xmmSrcHi = load128Unaligned ((__m128i*)ps);
+        xmmMaskHi = load128Unaligned ((__m128i*)pm);
+
+        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
+        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
(xmmSrcHi, &xmmSrcLo, &xmmSrcHi); + unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); + + expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi); + expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi); + + pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmMaskLo, xmmMaskHi, &xmmSrcLo, &xmmSrcHi); + pixMultiply_2x128 (xmmMaskLo, xmmMaskHi, xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi); + + negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi); + negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); + + pixAddMultiply_2x128 (xmmDstLo, xmmDstHi, xmmMaskLo, xmmMaskHi, + xmmSrcLo, xmmSrcHi, xmmAlphaDstLo, xmmAlphaDstHi, + &xmmDstLo, &xmmDstHi); + + save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = coreCombineXorCPixelsse2 (s, m, d); + w--; + } +} + +static inline void +coreCombineAddCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w) +{ + uint32_t s, m, d; + + __m128i xmmSrcLo, xmmSrcHi; + __m128i xmmDstLo, xmmDstHi; + __m128i xmmMaskLo, xmmMaskHi; + + /* call prefetch hint to optimize cache load*/ + cachePrefetch ((__m128i*)ps); + cachePrefetch ((__m128i*)pd); + cachePrefetch ((__m128i*)pm); + + while (w && (unsigned long)pd & 15) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x64_32 (_mm_adds_pu8 (pixMultiply_1x64 (unpack_32_1x64 (s), + unpack_32_1x64 (m)), + unpack_32_1x64 (d))); + w--; + } + + /* call prefetch hint to optimize cache load*/ + cachePrefetch ((__m128i*)ps); + cachePrefetch ((__m128i*)pd); + cachePrefetch ((__m128i*)pm); + + while (w >= 4) + { + /* fill cache line with next memory */ + cachePrefetchNext ((__m128i*)ps); + cachePrefetchNext ((__m128i*)pd); + cachePrefetchNext ((__m128i*)pm); + + xmmSrcHi = load128Unaligned ((__m128i*)ps); + xmmMaskHi = load128Unaligned ((__m128i*)pm); + xmmDstHi = load128Aligned ((__m128i*)pd); + + unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); + unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); + unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); + + pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmMaskLo, xmmMaskHi, &xmmSrcLo, &xmmSrcHi); + + save128Aligned( (__m128i*)pd, pack_2x128_128 (_mm_adds_epu8 (xmmSrcLo, xmmDstLo), + _mm_adds_epu8 (xmmSrcHi, xmmDstHi))); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x64_32 (_mm_adds_pu8 (pixMultiply_1x64 (unpack_32_1x64 (s), + unpack_32_1x64 (m)), + unpack_32_1x64 (d))); + w--; + } +} + +/* ------------------------------------------------------------------------------------------------- + * fbComposeSetupSSE + */ +static inline __m64 +createMask_16_64 (uint16_t mask) +{ + return _mm_set1_pi16 (mask); +} + +static inline __m128i +createMask_16_128 (uint16_t mask) +{ + return _mm_set1_epi16 (mask); +} + +static inline __m64 +createMask_2x32_64 (uint32_t mask0, uint32_t mask1) +{ + return _mm_set_pi32 (mask0, mask1); +} + +static inline __m128i +createMask_2x32_128 (uint32_t mask0, uint32_t mask1) +{ + return _mm_set_epi32 (mask0, mask1, mask0, mask1); +} + +/* SSE2 code patch for fbcompose.c */ + +static FASTCALL void +sse2CombineMaskU (uint32_t *dst, const uint32_t *src, int width) +{ + coreCombineReverseInUsse2 (dst, src, width); + _mm_empty(); +} + +static FASTCALL void +sse2CombineOverU (uint32_t *dst, const uint32_t *src, int width) +{ + coreCombineOverUsse2 (dst, src, width); + _mm_empty(); +} + +static 
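+/* Each of these thin wrappers adapts a core combiner to the
+ * pixman_composeFunctions hook signature.  The trailing _mm_empty()
+ * (EMMS) is needed because the unaligned head/tail paths use MMX
+ * (__m64) arithmetic, which aliases the x87 register stack. */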
FASTCALL void +sse2CombineOverReverseU (uint32_t *dst, const uint32_t *src, int width) +{ + coreCombineOverReverseUsse2 (dst, src, width); + _mm_empty(); +} + +static FASTCALL void +sse2CombineInU (uint32_t *dst, const uint32_t *src, int width) +{ + coreCombineInUsse2 (dst, src, width); + _mm_empty(); +} + +static FASTCALL void +sse2CombineInReverseU (uint32_t *dst, const uint32_t *src, int width) +{ + coreCombineReverseInUsse2 (dst, src, width); + _mm_empty(); +} + +static FASTCALL void +sse2CombineOutU (uint32_t *dst, const uint32_t *src, int width) +{ + coreCombineOutUsse2 (dst, src, width); + _mm_empty(); +} + +static FASTCALL void +sse2CombineOutReverseU (uint32_t *dst, const uint32_t *src, int width) +{ + coreCombineReverseOutUsse2 (dst, src, width); + _mm_empty(); +} + +static FASTCALL void +sse2CombineAtopU (uint32_t *dst, const uint32_t *src, int width) +{ + coreCombineAtopUsse2 (dst, src, width); + _mm_empty(); +} + +static FASTCALL void +sse2CombineAtopReverseU (uint32_t *dst, const uint32_t *src, int width) +{ + coreCombineReverseAtopUsse2 (dst, src, width); + _mm_empty(); +} + +static FASTCALL void +sse2CombineXorU (uint32_t *dst, const uint32_t *src, int width) +{ + coreCombineXorUsse2 (dst, src, width); + _mm_empty(); +} + +static FASTCALL void +sse2CombineAddU (uint32_t *dst, const uint32_t *src, int width) +{ + coreCombineAddUsse2 (dst, src, width); + _mm_empty(); +} + +static FASTCALL void +sse2CombineSaturateU (uint32_t *dst, const uint32_t *src, int width) +{ + coreCombineSaturateUsse2 (dst, src, width); + _mm_empty(); +} + +static FASTCALL void +sse2CombineSrcC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) +{ + coreCombineSrcCsse2 (dst, src, mask, width); + _mm_empty(); +} + +static FASTCALL void +sse2CombineOverC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) +{ + coreCombineOverCsse2 (dst, src, mask, width); + _mm_empty(); +} + +static FASTCALL void +sse2CombineOverReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) +{ + coreCombineOverReverseCsse2 (dst, src, mask, width); + _mm_empty(); +} + +static FASTCALL void +sse2CombineInC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) +{ + coreCombineInCsse2 (dst, src, mask, width); + _mm_empty(); +} + +static FASTCALL void +sse2CombineInReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) +{ + coreCombineInReverseCsse2 (dst, src, mask, width); + _mm_empty(); +} + +static FASTCALL void +sse2CombineOutC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) +{ + coreCombineOutCsse2 (dst, src, mask, width); + _mm_empty(); +} + +static FASTCALL void +sse2CombineOutReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) +{ + coreCombineOutReverseCsse2 (dst, src, mask, width); + _mm_empty(); +} + +static FASTCALL void +sse2CombineAtopC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) +{ + coreCombineAtopCsse2 (dst, src, mask, width); + _mm_empty(); +} + +static FASTCALL void +sse2CombineAtopReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) +{ + coreCombineReverseAtopCsse2 (dst, src, mask, width); + _mm_empty(); +} + +static FASTCALL void +sse2CombineXorC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) +{ + coreCombineXorCsse2 (dst, src, mask, width); + _mm_empty(); +} + +static FASTCALL void +sse2CombineAddC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) +{ + coreCombineAddCsse2 (dst, src, mask, width); + _mm_empty(); +} + +void +fbComposeSetupSSE(void) +{ + static pixman_bool_t initialized = FALSE; + + if 
(initialized) + return; + + /* check if we have SSE2 support and initialize accordingly */ + if (pixman_have_sse()) + { + /* SSE2 constants */ + Mask565r = createMask_2x32_128 (0x00f80000, 0x00f80000); + Mask565g1 = createMask_2x32_128 (0x00070000, 0x00070000); + Mask565g2 = createMask_2x32_128 (0x000000e0, 0x000000e0); + Mask565b = createMask_2x32_128 (0x0000001f, 0x0000001f); + MaskRed = createMask_2x32_128 (0x00f80000, 0x00f80000); + MaskGreen = createMask_2x32_128 (0x0000fc00, 0x0000fc00); + MaskBlue = createMask_2x32_128 (0x000000f8, 0x000000f8); + + Mask0080 = createMask_16_128 (0x0080); + Mask00ff = createMask_16_128 (0x00ff); + Mask0101 = createMask_16_128 (0x0101); + Maskffff = createMask_16_128 (0xffff); + Maskff000000 = createMask_2x32_128 (0xff000000, 0xff000000); + MaskAlpha = createMask_2x32_128 (0x00ff0000, 0x00000000); + + /* MMX constants */ + xMask565rgb = createMask_2x32_64 (0x000001f0, 0x003f001f); + xMask565Unpack = createMask_2x32_64 (0x00000084, 0x04100840); + + xMask0080 = createMask_16_64 (0x0080); + xMask00ff = createMask_16_64 (0x00ff); + xMask0101 = createMask_16_64 (0x0101); + xMaskAlpha = createMask_2x32_64 (0x00ff0000, 0x00000000); + + /* SSE code patch for fbcompose.c */ + pixman_composeFunctions.combineU[PIXMAN_OP_OVER] = sse2CombineOverU; + pixman_composeFunctions.combineU[PIXMAN_OP_OVER_REVERSE] = sse2CombineOverReverseU; + pixman_composeFunctions.combineU[PIXMAN_OP_IN] = sse2CombineInU; + pixman_composeFunctions.combineU[PIXMAN_OP_IN_REVERSE] = sse2CombineInReverseU; + pixman_composeFunctions.combineU[PIXMAN_OP_OUT] = sse2CombineOutU; + + pixman_composeFunctions.combineU[PIXMAN_OP_OUT_REVERSE] = sse2CombineOutReverseU; + pixman_composeFunctions.combineU[PIXMAN_OP_ATOP] = sse2CombineAtopU; + pixman_composeFunctions.combineU[PIXMAN_OP_ATOP_REVERSE] = sse2CombineAtopReverseU; + pixman_composeFunctions.combineU[PIXMAN_OP_XOR] = sse2CombineXorU; + pixman_composeFunctions.combineU[PIXMAN_OP_ADD] = sse2CombineAddU; + + pixman_composeFunctions.combineU[PIXMAN_OP_SATURATE] = sse2CombineSaturateU; + + pixman_composeFunctions.combineC[PIXMAN_OP_SRC] = sse2CombineSrcC; + pixman_composeFunctions.combineC[PIXMAN_OP_OVER] = sse2CombineOverC; + pixman_composeFunctions.combineC[PIXMAN_OP_OVER_REVERSE] = sse2CombineOverReverseC; + pixman_composeFunctions.combineC[PIXMAN_OP_IN] = sse2CombineInC; + pixman_composeFunctions.combineC[PIXMAN_OP_IN_REVERSE] = sse2CombineInReverseC; + pixman_composeFunctions.combineC[PIXMAN_OP_OUT] = sse2CombineOutC; + pixman_composeFunctions.combineC[PIXMAN_OP_OUT_REVERSE] = sse2CombineOutReverseC; + pixman_composeFunctions.combineC[PIXMAN_OP_ATOP] = sse2CombineAtopC; + pixman_composeFunctions.combineC[PIXMAN_OP_ATOP_REVERSE] = sse2CombineAtopReverseC; + pixman_composeFunctions.combineC[PIXMAN_OP_XOR] = sse2CombineXorC; + pixman_composeFunctions.combineC[PIXMAN_OP_ADD] = sse2CombineAddC; + + pixman_composeFunctions.combineMaskU = sse2CombineMaskU; + } + + initialized = TRUE; +} + + +/* ------------------------------------------------------------------------------------------------- + * fbCompositeSolid_nx8888 + */ + +void +fbCompositeSolid_nx8888sse2 (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height) +{ + uint32_t src; + uint32_t *dstLine, *dst, d; + uint16_t w; + int dstStride; + __m128i xmmSrc, xmmAlpha; + __m128i xmmDst, xmmDstLo, xmmDstHi; + + fbComposeGetSolid(pSrc, 
src, pDst->bits.format); + + if (src >> 24 == 0) + return; + + fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); + + xmmSrc = expandPixel_32_1x128 (src); + xmmAlpha = expandAlpha_1x128 (xmmSrc); + + while (height--) + { + dst = dstLine; + + /* call prefetch hint to optimize cache load*/ + cachePrefetch ((__m128i*)dst); + + dstLine += dstStride; + w = width; + + while (w && (unsigned long)dst & 15) + { + d = *dst; + *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc), + _mm_movepi64_pi64 (xmmAlpha), + unpack_32_1x64 (d))); + w--; + } + + cachePrefetch ((__m128i*)dst); + + while (w >= 4) + { + /* fill cache line with next memory */ + cachePrefetchNext ((__m128i*)dst); + + xmmDst = load128Aligned ((__m128i*)dst); + + unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); + + over_2x128 (xmmSrc, xmmSrc, xmmAlpha, xmmAlpha, &xmmDstLo, &xmmDstHi); + + /* rebuid the 4 pixel data and save*/ + save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); + + w -= 4; + dst += 4; + } + + while (w) + { + d = *dst; + *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc), + _mm_movepi64_pi64 (xmmAlpha), + unpack_32_1x64 (d))); + w--; + } + + } + _mm_empty(); +} + +/* ------------------------------------------------------------------------------------------------- + * fbCompositeSolid_nx0565 + */ +void +fbCompositeSolid_nx0565sse2 (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height) +{ + uint32_t src; + uint16_t *dstLine, *dst, d; + uint16_t w; + int dstStride; + __m128i xmmSrc, xmmAlpha; + __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3; + + fbComposeGetSolid(pSrc, src, pDst->bits.format); + + if (src >> 24 == 0) + return; + + fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); + + xmmSrc = expandPixel_32_1x128 (src); + xmmAlpha = expandAlpha_1x128 (xmmSrc); + + while (height--) + { + dst = dstLine; + + /* call prefetch hint to optimize cache load*/ + cachePrefetch ((__m128i*)dst); + + dstLine += dstStride; + w = width; + + while (w && (unsigned long)dst & 15) + { + d = *dst; + *dst++ = pack565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc), + _mm_movepi64_pi64 (xmmAlpha), + expand565_16_1x64 (d)))); + w--; + } + + /* call prefetch hint to optimize cache load*/ + cachePrefetch ((__m128i*)dst); + + while (w >= 8) + { + /* fill cache line with next memory */ + cachePrefetchNext ((__m128i*)dst); + + xmmDst = load128Aligned ((__m128i*)dst); + + unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3); + + over_2x128 (xmmSrc, xmmSrc, xmmAlpha, xmmAlpha, &xmmDst0, &xmmDst1); + over_2x128 (xmmSrc, xmmSrc, xmmAlpha, xmmAlpha, &xmmDst2, &xmmDst3); + + xmmDst = pack565_4x128_128 (xmmDst0, xmmDst1, xmmDst2, xmmDst3); + + save128Aligned ((__m128i*)dst, xmmDst); + + dst += 8; + w -= 8; + } + + while (w--) + { + d = *dst; + *dst++ = pack565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc), + _mm_movepi64_pi64 (xmmAlpha), + expand565_16_1x64 (d)))); + } + } + + _mm_empty(); +} + +/* ------------------------------------------------------------------------------------------------- + * fbCompositeSolidMask_nx8888x8888C + */ + +void +fbCompositeSolidMask_nx8888x8888Csse2 (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, 
+ uint16_t width, + uint16_t height) +{ + uint32_t src, srca; + uint32_t *dstLine, d; + uint32_t *maskLine, m; + uint32_t packCmp; + int dstStride, maskStride; + + __m128i xmmSrc, xmmAlpha; + __m128i xmmDst, xmmDstLo, xmmDstHi; + __m128i xmmMask, xmmMaskLo, xmmMaskHi; + + fbComposeGetSolid(pSrc, src, pDst->bits.format); + + srca = src >> 24; + if (srca == 0) + return; + + fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); + fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1); + + xmmSrc = _mm_unpacklo_epi8 (createMask_2x32_128 (src, src), _mm_setzero_si128 ()); + xmmAlpha = expandAlpha_1x128 (xmmSrc); + + while (height--) + { + int w = width; + uint32_t *pm = (uint32_t *)maskLine; + uint32_t *pd = (uint32_t *)dstLine; + + dstLine += dstStride; + maskLine += maskStride; + + /* call prefetch hint to optimize cache load*/ + cachePrefetch ((__m128i*)pd); + cachePrefetch ((__m128i*)pm); + + while (w && (unsigned long)pd & 15) + { + m = *pm++; + + if (m) + { + d = *pd; + + *pd = pack_1x64_32 (inOver_1x64 (_mm_movepi64_pi64 (xmmSrc), + _mm_movepi64_pi64 (xmmAlpha), + unpack_32_1x64 (m), + unpack_32_1x64 (d))); + } + + pd++; + w--; + } + + /* call prefetch hint to optimize cache load*/ + cachePrefetch ((__m128i*)pd); + cachePrefetch ((__m128i*)pm); + + while (w >= 4) + { + /* fill cache line with next memory */ + cachePrefetchNext ((__m128i*)pd); + cachePrefetchNext ((__m128i*)pm); + + xmmMask = load128Unaligned ((__m128i*)pm); + + packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128())); + + /* if all bits in mask are zero, packCmp are equal to 0xffff */ + if (packCmp != 0xffff) + { + xmmDst = load128Aligned ((__m128i*)pd); + + unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); + unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); + + inOver_2x128 (xmmSrc, xmmSrc, xmmAlpha, xmmAlpha, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi); + + save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); + } + + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + m = *pm++; + + if (m) + { + d = *pd; + + *pd = pack_1x64_32 (inOver_1x64 (_mm_movepi64_pi64 (xmmSrc), + _mm_movepi64_pi64 (xmmAlpha), + unpack_32_1x64 (m), + unpack_32_1x64 (d))); + } + + pd++; + w--; + } + } + + _mm_empty(); +} + + +/* ------------------------------------------------------------------------------------------------- + * fbCompositeSrc_8888x8x8888 + */ + +void +fbCompositeSrc_8888x8x8888sse2 (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height) +{ + uint32_t *dstLine, *dst; + uint32_t *srcLine, *src; + uint32_t mask; + uint16_t w; + int dstStride, srcStride; + + __m128i xmmMask; + __m128i xmmSrc, xmmSrcLo, xmmSrcHi; + __m128i xmmDst, xmmDstLo, xmmDstHi; + __m128i xmmAlphaLo, xmmAlphaHi; + + fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); + fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); + fbComposeGetSolid (pMask, mask, pDst->bits.format); + + xmmMask = createMask_16_128 (mask >> 24); + + while (height--) + { + dst = dstLine; + dstLine += dstStride; + src = srcLine; + srcLine += srcStride; + w = width; + + /* call prefetch hint to optimize cache load*/ + cachePrefetch ((__m128i*)dst); + cachePrefetch ((__m128i*)src); + + while (w && (unsigned long)dst & 15) + { + uint32_t s = *src++; + uint32_t d = *dst; + + __m64 ms = unpack_32_1x64 (s); + + *dst++ 
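+	    /* inOver_1x64 (s, sa, m, d) composites (s IN m) OVER d,
+	     * i.e. per channel:  s*m + d*(1 - sa*m), with each
+	     * product 0xff-normalized. */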
= pack_1x64_32 (inOver_1x64 (ms, + expandAlpha_1x64 (ms), + _mm_movepi64_pi64 (xmmMask), + unpack_32_1x64 (d))); + + w--; + } + + /* call prefetch hint to optimize cache load*/ + cachePrefetch ((__m128i*)dst); + cachePrefetch ((__m128i*)src); + + while (w >= 4) + { + /* fill cache line with next memory */ + cachePrefetchNext ((__m128i*)dst); + cachePrefetchNext ((__m128i*)src); + + xmmSrc = load128Unaligned ((__m128i*)src); + xmmDst = load128Aligned ((__m128i*)dst); + + unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); + unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); + expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi); + + inOver_2x128 (xmmSrcLo, xmmSrcHi, xmmAlphaLo, xmmAlphaHi, xmmMask, xmmMask, &xmmDstLo, &xmmDstHi); + + save128Aligned( (__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); + + dst += 4; + src += 4; + w -= 4; + } + + while (w) + { + uint32_t s = *src++; + uint32_t d = *dst; + + __m64 ms = unpack_32_1x64 (s); + + *dst++ = pack_1x64_32 (inOver_1x64 (ms, + expandAlpha_1x64 (ms), + _mm_movepi64_pi64 (xmmMask), + unpack_32_1x64 (d))); + + w--; + } + } + + _mm_empty(); +} + +/* ------------------------------------------------------------------------------------------------- + * fbCompositeSrc_x888xnx8888 + */ +void +fbCompositeSrc_x888xnx8888sse2 (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height) +{ + uint32_t *dstLine, *dst; + uint32_t *srcLine, *src; + uint32_t mask; + int dstStride, srcStride; + uint16_t w; + + __m128i xmmMask, xmmAlpha; + __m128i xmmSrc, xmmSrcLo, xmmSrcHi; + __m128i xmmDst, xmmDstLo, xmmDstHi; + + fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); + fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); + fbComposeGetSolid (pMask, mask, pDst->bits.format); + + xmmMask = createMask_16_128 (mask >> 24); + xmmAlpha = Mask00ff; + + while (height--) + { + dst = dstLine; + dstLine += dstStride; + src = srcLine; + srcLine += srcStride; + w = width; + + /* call prefetch hint to optimize cache load*/ + cachePrefetch ((__m128i*)dst); + cachePrefetch ((__m128i*)src); + + while (w && (unsigned long)dst & 15) + { + uint32_t s = (*src++) | 0xff000000; + uint32_t d = *dst; + + *dst++ = pack_1x64_32 (inOver_1x64 (unpack_32_1x64 (s), + _mm_movepi64_pi64 (xmmAlpha), + _mm_movepi64_pi64 (xmmMask), + unpack_32_1x64 (d))); + + w--; + } + + /* call prefetch hint to optimize cache load*/ + cachePrefetch ((__m128i*)dst); + cachePrefetch ((__m128i*)src); + + while (w >= 4) + { + /* fill cache line with next memory */ + cachePrefetchNext ((__m128i*)dst); + cachePrefetchNext ((__m128i*)src); + + xmmSrc = load128Unaligned ((__m128i*)src); + xmmDst = load128Aligned ((__m128i*)dst); + + unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); + unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); + + inOver_2x128 (xmmSrcLo, xmmSrcHi, xmmAlpha, xmmAlpha, xmmMask, xmmMask, &xmmDstLo, &xmmDstHi); + + save128Aligned( (__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); + + dst += 4; + src += 4; + w -= 4; + + } + + while (w) + { + uint32_t s = (*src++) | 0xff000000; + uint32_t d = *dst; + + *dst++ = pack_1x64_32 (inOver_1x64 (unpack_32_1x64 (s), + _mm_movepi64_pi64 (xmmAlpha), + _mm_movepi64_pi64 (xmmMask), + unpack_32_1x64 (d))); + + w--; + } + } + + _mm_empty(); +} + +/* ------------------------------------------------------------------------------------------------- + * 
fbCompositeSrc_8888x8888 + */ +void +fbCompositeSrc_8888x8888sse2 (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height) +{ + int dstStride, srcStride; + uint32_t *dstLine, *dst; + uint32_t *srcLine, *src; + + fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); + fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); + + dst = dstLine; + src = srcLine; + + while (height--) + { + coreCombineOverUsse2 (dst, src, width); + + dst += dstStride; + src += srcStride; + } + _mm_empty(); +} + +/* ------------------------------------------------------------------------------------------------- + * fbCompositeSrc_8888x0565 + */ +static inline uint16_t +fbCompositeSrc_8888x0565pixel (uint32_t src, uint16_t dst) +{ + __m64 ms; + + ms = unpack_32_1x64 (src); + return pack565_32_16( pack_1x64_32 (over_1x64 (ms, + expandAlpha_1x64 (ms), + expand565_16_1x64 (dst)))); +} + +void +fbCompositeSrc_8888x0565sse2 (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height) +{ + uint16_t *dstLine, *dst, d; + uint32_t *srcLine, *src, s; + int dstStride, srcStride; + uint16_t w; + + __m128i xmmAlphaLo, xmmAlphaHi; + __m128i xmmSrc, xmmSrcLo, xmmSrcHi; + __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3; + + fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); + fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); + +#if 0 + /* FIXME + * + * I copy the code from MMX one and keep the fixme. + * If it's a problem there, probably is a problem here. + */ + assert (pSrc->pDrawable == pMask->pDrawable); +#endif + + while (height--) + { + dst = dstLine; + src = srcLine; + + /* call prefetch hint to optimize cache load*/ + cachePrefetch ((__m128i*)src); + cachePrefetch ((__m128i*)dst); + + dstLine += dstStride; + srcLine += srcStride; + w = width; + + /* Align dst on a 16-byte boundary */ + while (w && + ((unsigned long)dst & 15)) + { + s = *src++; + d = *dst; + + *dst++ = fbCompositeSrc_8888x0565pixel (s, d); + w--; + } + + /* call prefetch hint to optimize cache load*/ + cachePrefetch ((__m128i*)src); + cachePrefetch ((__m128i*)dst); + + /* It's a 8 pixel loop */ + while (w >= 8) + { + /* fill cache line with next memory */ + cachePrefetchNext ((__m128i*)src); + cachePrefetchNext ((__m128i*)dst); + + /* I'm loading unaligned because I'm not sure about the address alignment. */ + xmmSrc = load128Unaligned ((__m128i*) src); + xmmDst = load128Aligned ((__m128i*) dst); + + /* Unpacking */ + unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); + unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3); + expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi); + + /* I'm loading next 4 pixels from memory before to optimze the memory read. 
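+     * (That is, the next iteration's source load is issued before the
+     * current four pixels are composited, overlapping the load latency
+     * with the arithmetic; a simple form of software pipelining.)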
*/ + xmmSrc = load128Unaligned ((__m128i*) (src+4)); + + over_2x128 (xmmSrcLo, xmmSrcHi, xmmAlphaLo, xmmAlphaHi, &xmmDst0, &xmmDst1); + + /* Unpacking */ + unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); + expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi); + + over_2x128 (xmmSrcLo, xmmSrcHi, xmmAlphaLo, xmmAlphaHi, &xmmDst2, &xmmDst3); + + save128Aligned ((__m128i*)dst, pack565_4x128_128 (xmmDst0, xmmDst1, xmmDst2, xmmDst3)); + + w -= 8; + dst += 8; + src += 8; + } + + while (w--) + { + s = *src++; + d = *dst; + + *dst++ = fbCompositeSrc_8888x0565pixel (s, d); + } + } + + _mm_empty(); +} + +/* ------------------------------------------------------------------------------------------------- + * fbCompositeSolidMask_nx8x8888 + */ + +void +fbCompositeSolidMask_nx8x8888sse2 (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height) +{ + uint32_t src, srca; + uint32_t *dstLine, *dst; + uint8_t *maskLine, *mask; + int dstStride, maskStride; + uint16_t w; + uint32_t m, d; + + __m128i xmmSrc, xmmAlpha, xmmDef; + __m128i xmmDst, xmmDstLo, xmmDstHi; + __m128i xmmMask, xmmMaskLo, xmmMaskHi; + + fbComposeGetSolid(pSrc, src, pDst->bits.format); + + srca = src >> 24; + if (srca == 0) + return; + + fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); + fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + + xmmDef = createMask_2x32_128 (src, src); + xmmSrc = expandPixel_32_1x128 (src); + xmmAlpha = expandAlpha_1x128 (xmmSrc); + + while (height--) + { + dst = dstLine; + dstLine += dstStride; + mask = maskLine; + maskLine += maskStride; + w = width; + + /* call prefetch hint to optimize cache load*/ + cachePrefetch ((__m128i*)mask); + cachePrefetch ((__m128i*)dst); + + while (w && (unsigned long)dst & 15) + { + uint8_t m = *mask++; + + if (m) + { + d = *dst; + + *dst = pack_1x64_32 (inOver_1x64 (_mm_movepi64_pi64 (xmmSrc), + _mm_movepi64_pi64 (xmmAlpha), + expandPixel_8_1x64 (m), + unpack_32_1x64 (d))); + } + + w--; + dst++; + } + + /* call prefetch hint to optimize cache load*/ + cachePrefetch ((__m128i*)mask); + cachePrefetch ((__m128i*)dst); + + while (w >= 4) + { + /* fill cache line with next memory */ + cachePrefetchNext ((__m128i*)mask); + cachePrefetchNext ((__m128i*)dst); + + m = *((uint32_t*)mask); + + if (srca == 0xff && m == 0xffffffff) + { + save128Aligned ((__m128i*)dst, xmmDef); + } + else if (m) + { + xmmDst = load128Aligned ((__m128i*) dst); + xmmMask = unpack_32_1x128 (m); + xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128()); + + /* Unpacking */ + unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); + unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); + + expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); + + inOver_2x128 (xmmSrc, xmmSrc, xmmAlpha, xmmAlpha, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi); + + save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); + } + + w -= 4; + dst += 4; + mask += 4; + } + + while (w) + { + uint8_t m = *mask++; + + if (m) + { + d = *dst; + + *dst = pack_1x64_32 (inOver_1x64 (_mm_movepi64_pi64 (xmmSrc), + _mm_movepi64_pi64 (xmmAlpha), + expandPixel_8_1x64 (m), + unpack_32_1x64 (d))); + } + + w--; + dst++; + } + } + + _mm_empty(); +} + +/* ------------------------------------------------------------------------------------------------- + * fbCompositeSolidMask_nx8x8888 + */ + +pixman_bool_t 
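+/* Solid fill of a 16bpp or 32bpp rectangle using aligned 128-bit
+ * stores, unrolled into 128- and 64-byte blocks with prefetch.
+ * Returns FALSE when bpp is unsupported or, for 16bpp, when `data'
+ * is not the 16-bit value replicated into both halves, in which case
+ * the caller must fall back to the generic fill. */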
+pixmanFillsse2 (uint32_t *bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t data) +{ + uint32_t byte_width; + uint8_t *byte_line; + + __m128i xmmDef; + + if (bpp == 16 && (data >> 16 != (data & 0xffff))) + return FALSE; + + if (bpp != 16 && bpp != 32) + return FALSE; + + if (bpp == 16) + { + stride = stride * (int) sizeof (uint32_t) / 2; + byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); + byte_width = 2 * width; + stride *= 2; + } + else + { + stride = stride * (int) sizeof (uint32_t) / 4; + byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); + byte_width = 4 * width; + stride *= 4; + } + + cachePrefetch ((__m128i*)byte_line); + xmmDef = createMask_2x32_128 (data, data); + + while (height--) + { + int w; + uint8_t *d = byte_line; + byte_line += stride; + w = byte_width; + + + cachePrefetchNext ((__m128i*)d); + + while (w >= 2 && ((unsigned long)d & 3)) + { + *(uint16_t *)d = data; + w -= 2; + d += 2; + } + + while (w >= 4 && ((unsigned long)d & 15)) + { + *(uint32_t *)d = data; + + w -= 4; + d += 4; + } + + cachePrefetchNext ((__m128i*)d); + + while (w >= 128) + { + cachePrefetch (((__m128i*)d) + 12); + + save128Aligned ((__m128i*)(d), xmmDef); + save128Aligned ((__m128i*)(d+16), xmmDef); + save128Aligned ((__m128i*)(d+32), xmmDef); + save128Aligned ((__m128i*)(d+48), xmmDef); + save128Aligned ((__m128i*)(d+64), xmmDef); + save128Aligned ((__m128i*)(d+80), xmmDef); + save128Aligned ((__m128i*)(d+96), xmmDef); + save128Aligned ((__m128i*)(d+112), xmmDef); + + d += 128; + w -= 128; + } + + if (w >= 64) + { + cachePrefetch (((__m128i*)d) + 8); + + save128Aligned ((__m128i*)(d), xmmDef); + save128Aligned ((__m128i*)(d+16), xmmDef); + save128Aligned ((__m128i*)(d+32), xmmDef); + save128Aligned ((__m128i*)(d+48), xmmDef); + + d += 64; + w -= 64; + } + + cachePrefetchNext ((__m128i*)d); + + if (w >= 32) + { + save128Aligned ((__m128i*)(d), xmmDef); + save128Aligned ((__m128i*)(d+16), xmmDef); + + d += 32; + w -= 32; + } + + if (w >= 16) + { + save128Aligned ((__m128i*)(d), xmmDef); + + d += 16; + w -= 16; + } + + cachePrefetchNext ((__m128i*)d); + + while (w >= 4) + { + *(uint32_t *)d = data; + + w -= 4; + d += 4; + } + + if (w >= 2) + { + *(uint16_t *)d = data; + w -= 2; + d += 2; + } + } + + _mm_empty(); + return TRUE; +} + +void +fbCompositeSolidMaskSrc_nx8x8888sse2 (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height) +{ + uint32_t src, srca; + uint32_t *dstLine, *dst; + uint8_t *maskLine, *mask; + int dstStride, maskStride; + uint16_t w; + uint32_t m; + + __m128i xmmSrc, xmmDef; + __m128i xmmMask, xmmMaskLo, xmmMaskHi; + + fbComposeGetSolid(pSrc, src, pDst->bits.format); + + srca = src >> 24; + if (srca == 0) + { + pixmanFillsse2 (pDst->bits.bits, pDst->bits.rowstride, + PIXMAN_FORMAT_BPP (pDst->bits.format), + xDst, yDst, width, height, 0); + return; + } + + fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); + fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + + xmmDef = createMask_2x32_128 (src, src); + xmmSrc = expandPixel_32_1x128 (src); + + while (height--) + { + dst = dstLine; + dstLine += dstStride; + mask = maskLine; + maskLine += maskStride; + w = width; + + /* call prefetch hint to optimize cache load*/ + cachePrefetch ((__m128i*)mask); + cachePrefetch ((__m128i*)dst); + + while (w && (unsigned long)dst & 
15) + { + uint8_t m = *mask++; + + if (m) + { + *dst = pack_1x64_32 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmSrc), expandPixel_8_1x64 (m))); + } + else + { + *dst = 0; + } + + w--; + dst++; + } + + /* call prefetch hint to optimize cache load*/ + cachePrefetch ((__m128i*)mask); + cachePrefetch ((__m128i*)dst); + + while (w >= 4) + { + /* fill cache line with next memory */ + cachePrefetchNext ((__m128i*)mask); + cachePrefetchNext ((__m128i*)dst); + + m = *((uint32_t*)mask); + + if (srca == 0xff && m == 0xffffffff) + { + save128Aligned ((__m128i*)dst, xmmDef); + } + else if (m) + { + xmmMask = unpack_32_1x128 (m); + xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128()); + + /* Unpacking */ + unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); + + expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); + + pixMultiply_2x128 (xmmSrc, xmmSrc, xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); + + save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmMaskLo, xmmMaskHi)); + } + else + { + save128Aligned ((__m128i*)dst, _mm_setzero_si128()); + } + + w -= 4; + dst += 4; + mask += 4; + } + + while (w) + { + uint8_t m = *mask++; + + if (m) + { + *dst = pack_1x64_32 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmSrc), expandPixel_8_1x64 (m))); + } + else + { + *dst = 0; + } + + w--; + dst++; + } + } + + _mm_empty(); +} + +/* ------------------------------------------------------------------------------------------------- + * fbCompositeSolidMask_nx8x0565 + */ + +void +fbCompositeSolidMask_nx8x0565sse2 (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height) +{ + uint32_t src, srca; + uint16_t *dstLine, *dst, d; + uint8_t *maskLine, *mask; + int dstStride, maskStride; + uint16_t w; + uint32_t m; + + __m128i xmmSrc, xmmAlpha; + __m128i xmmMask, xmmMaskLo, xmmMaskHi; + __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3; + + fbComposeGetSolid(pSrc, src, pDst->bits.format); + + srca = src >> 24; + if (srca == 0) + return; + + fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); + fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + + xmmSrc = expandPixel_32_1x128 (src); + xmmAlpha = expandAlpha_1x128 (xmmSrc); + + while (height--) + { + dst = dstLine; + dstLine += dstStride; + mask = maskLine; + maskLine += maskStride; + w = width; + + /* call prefetch hint to optimize cache load*/ + cachePrefetch ((__m128i*)mask); + cachePrefetch ((__m128i*)dst); + + while (w && (unsigned long)dst & 15) + { + m = *mask++; + + if (m) + { + d = *dst; + + *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (_mm_movepi64_pi64 (xmmSrc), + _mm_movepi64_pi64 (xmmAlpha), + expandAlphaRev_1x64 (unpack_32_1x64 (m)), + expand565_16_1x64 (d)))); + } + + w--; + dst++; + } + + /* call prefetch hint to optimize cache load*/ + cachePrefetch ((__m128i*)mask); + cachePrefetch ((__m128i*)dst); + + while (w >= 8) + { + /* fill cache line with next memory */ + cachePrefetchNext ((__m128i*)mask); + cachePrefetchNext ((__m128i*)dst); + + xmmDst = load128Aligned ((__m128i*) dst); + unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3); + + m = *((uint32_t*)mask); + mask += 4; + + if (m) + { + xmmMask = unpack_32_1x128 (m); + xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128()); + + /* Unpacking */ + unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); + + expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, 
&xmmMaskHi); + inOver_2x128 (xmmSrc, xmmSrc, xmmAlpha, xmmAlpha, xmmMaskLo, xmmMaskHi, &xmmDst0, &xmmDst1); + } + + m = *((uint32_t*)mask); + mask += 4; + + if (m) + { + xmmMask = unpack_32_1x128 (m); + xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128()); + + /* Unpacking */ + unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); + + expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); + inOver_2x128 (xmmSrc, xmmSrc, xmmAlpha, xmmAlpha, xmmMaskLo, xmmMaskHi, &xmmDst2, &xmmDst3); + } + + save128Aligned ((__m128i*)dst, pack565_4x128_128 (xmmDst0, xmmDst1, xmmDst2, xmmDst3)); + + w -= 8; + dst += 8; + } + + while (w) + { + m = *mask++; + + if (m) + { + d = *dst; + + *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (_mm_movepi64_pi64 (xmmSrc), + _mm_movepi64_pi64 (xmmAlpha), + expandAlphaRev_1x64 (unpack_32_1x64 (m)), + expand565_16_1x64 (d)))); + } + + w--; + dst++; + } + } + + _mm_empty(); +} + +/* ------------------------------------------------------------------------------------------------- + * fbCompositeSrc_8888RevNPx0565 + */ + +void +fbCompositeSrc_8888RevNPx0565sse2 (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height) +{ + uint16_t *dstLine, *dst, d; + uint32_t *srcLine, *src, s; + int dstStride, srcStride; + uint16_t w; + uint32_t packCmp; + + __m64 ms; + __m128i xmmSrc, xmmSrcLo, xmmSrcHi; + __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3; + + fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); + fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); + +#if 0 + /* FIXME + * + * I copy the code from MMX one and keep the fixme. + * If it's a problem there, probably is a problem here. 
+ */ + assert (pSrc->pDrawable == pMask->pDrawable); +#endif + + while (height--) + { + dst = dstLine; + dstLine += dstStride; + src = srcLine; + srcLine += srcStride; + w = width; + + /* call prefetch hint to optimize cache load*/ + cachePrefetch ((__m128i*)src); + cachePrefetch ((__m128i*)dst); + + while (w && (unsigned long)dst & 15) + { + s = *src++; + d = *dst; + + ms = unpack_32_1x64 (s); + + *dst++ = pack565_32_16 (pack_1x64_32 (overRevNonPre_1x64(ms, expand565_16_1x64 (d)))); + w--; + } + + /* call prefetch hint to optimize cache load*/ + cachePrefetch ((__m128i*)src); + cachePrefetch ((__m128i*)dst); + + while (w >= 8) + { + /* fill cache line with next memory */ + cachePrefetchNext ((__m128i*)src); + cachePrefetchNext ((__m128i*)dst); + + /* First round */ + xmmSrc = load128Unaligned((__m128i*)src); + xmmDst = load128Aligned ((__m128i*)dst); + + packCmp = packAlpha (xmmSrc); + + unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3); + unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); + + /* preload next round*/ + xmmSrc = load128Unaligned((__m128i*)(src+4)); + /* preload next round*/ + + if (packCmp == 0xffffffff) + { + invertColors_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst0, &xmmDst1); + } + else if (packCmp) + { + overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst0, &xmmDst1); + } + + /* Second round */ + packCmp = packAlpha (xmmSrc); + + unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); + + if (packCmp == 0xffffffff) + { + invertColors_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst2, &xmmDst3); + } + else if (packCmp) + { + overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst2, &xmmDst3); + } + + save128Aligned ((__m128i*)dst, pack565_4x128_128 (xmmDst0, xmmDst1, xmmDst2, xmmDst3)); + + w -= 8; + src += 8; + dst += 8; + } + + while (w) + { + s = *src++; + d = *dst; + + ms = unpack_32_1x64 (s); + + *dst++ = pack565_32_16 (pack_1x64_32 (overRevNonPre_1x64(ms, expand565_16_1x64 (d)))); + w--; + } + } + + _mm_empty(); +} + +/* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */ + +/* ------------------------------------------------------------------------------------------------- + * fbCompositeSrc_8888RevNPx8888 + */ + +void +fbCompositeSrc_8888RevNPx8888sse2 (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height) +{ + uint32_t *dstLine, *dst, d; + uint32_t *srcLine, *src, s; + int dstStride, srcStride; + uint16_t w; + uint32_t packCmp; + + __m128i xmmSrcLo, xmmSrcHi; + __m128i xmmDstLo, xmmDstHi; + + fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); + fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); + +#if 0 + /* FIXME + * + * I copy the code from MMX one and keep the fixme. + * If it's a problem there, probably is a problem here. 
+ */
+    assert (pSrc->pDrawable == pMask->pDrawable);
+#endif
+
+    while (height--)
+    {
+        dst = dstLine;
+        dstLine += dstStride;
+        src = srcLine;
+        srcLine += srcStride;
+        w = width;
+
+        /* call prefetch hint to optimize cache load */
+        cachePrefetch ((__m128i*)src);
+        cachePrefetch ((__m128i*)dst);
+
+        while (w && (unsigned long)dst & 15)
+        {
+            s = *src++;
+            d = *dst;
+
+            *dst++ = pack_1x64_32 (overRevNonPre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
+
+            w--;
+        }
+
+        /* call prefetch hint to optimize cache load */
+        cachePrefetch ((__m128i*)src);
+        cachePrefetch ((__m128i*)dst);
+
+        while (w >= 4)
+        {
+            /* fill cache line with next memory */
+            cachePrefetchNext ((__m128i*)src);
+            cachePrefetchNext ((__m128i*)dst);
+
+            xmmSrcHi = load128Unaligned((__m128i*)src);
+
+            packCmp = packAlpha (xmmSrcHi);
+
+            unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
+
+            if (packCmp == 0xffffffff)
+            {
+                invertColors_2x128 (xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi);
+
+                save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
+            }
+            else if (packCmp)
+            {
+                xmmDstHi = load128Aligned ((__m128i*)dst);
+
+                unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
+
+                overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi);
+
+                save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
+            }
+
+            w -= 4;
+            dst += 4;
+            src += 4;
+        }
+
+        while (w)
+        {
+            s = *src++;
+            d = *dst;
+
+            *dst++ = pack_1x64_32 (overRevNonPre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
+
+            w--;
+        }
+    }
+
+    _mm_empty();
+}
+
+/* -------------------------------------------------------------------------------------------------
+ * fbCompositeSolidMask_nx8888x0565C
+ */
+
+void
+fbCompositeSolidMask_nx8888x0565Csse2 (pixman_op_t op,
+                                       pixman_image_t * pSrc,
+                                       pixman_image_t * pMask,
+                                       pixman_image_t * pDst,
+                                       int16_t xSrc,
+                                       int16_t ySrc,
+                                       int16_t xMask,
+                                       int16_t yMask,
+                                       int16_t xDst,
+                                       int16_t yDst,
+                                       uint16_t width,
+                                       uint16_t height)
+{
+    uint32_t src, srca;
+    uint16_t *dstLine, *dst, d;
+    uint32_t *maskLine, *mask, m;
+    int dstStride, maskStride;
+    int w;
+    uint32_t packCmp;
+
+    __m128i xmmSrc, xmmAlpha;
+    __m128i xmmMask, xmmMaskLo, xmmMaskHi;
+    __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;
+
+    fbComposeGetSolid(pSrc, src, pDst->bits.format);
+
+    srca = src >> 24;
+    if (srca == 0)
+        return;
+
+    fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
+    fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1);
+
+    xmmSrc = unpack_32_1x128 (src);
+    xmmAlpha = expandAlpha_1x128 (xmmSrc);
+
+    while (height--)
+    {
+        w = width;
+        mask = maskLine;
+        dst = dstLine;
+        maskLine += maskStride;
+        dstLine += dstStride;
+
+        /* call prefetch hint to optimize cache load */
+        cachePrefetch ((__m128i*)mask);
+        cachePrefetch ((__m128i*)dst);
+
+        while (w && ((unsigned long)dst & 15))
+        {
+            m = *(uint32_t *) mask;
+
+            if (m)
+            {
+                d = *dst;
+
+                *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (_mm_movepi64_pi64 (xmmSrc),
+                                                                 _mm_movepi64_pi64 (xmmAlpha),
+                                                                 unpack_32_1x64 (m),
+                                                                 expand565_16_1x64 (d))));
+            }
+
+            w--;
+            dst++;
+            mask++;
+        }
+
+        /* call prefetch hint to optimize cache load */
+        cachePrefetch ((__m128i*)mask);
+        cachePrefetch ((__m128i*)dst);
+
+        while (w >= 8)
+        {
+            /* fill cache line with next memory */
+            cachePrefetchNext ((__m128i*)mask);
+            cachePrefetchNext ((__m128i*)dst);
+
+            /* First round */
+            xmmMask = load128Unaligned((__m128i*)mask);
+            xmmDst = load128Aligned((__m128i*)dst);
+
+            packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128()));
+
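+            /* packCmp collects the top bit of each result byte, so it
+             * is 0xffff only when all four mask pixels are zero; the
+             * in-over passes below are then skipped and the destination
+             * pixels pass through unchanged. */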
+            unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
+            unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
+
+            /* preload next round */
+            xmmMask = load128Unaligned((__m128i*)(mask+4));
+
+            if (packCmp != 0xffff)
+            {
+                inOver_2x128 (xmmSrc, xmmSrc, xmmAlpha, xmmAlpha, xmmMaskLo, xmmMaskHi, &xmmDst0, &xmmDst1);
+            }
+
+            /* Second round */
+            packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128()));
+
+            unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
+
+            if (packCmp != 0xffff)
+            {
+                inOver_2x128 (xmmSrc, xmmSrc, xmmAlpha, xmmAlpha, xmmMaskLo, xmmMaskHi, &xmmDst2, &xmmDst3);
+            }
+
+            save128Aligned ((__m128i*)dst, pack565_4x128_128 (xmmDst0, xmmDst1, xmmDst2, xmmDst3));
+
+            w -= 8;
+            dst += 8;
+            mask += 8;
+        }
+
+        while (w)
+        {
+            m = *(uint32_t *) mask;
+
+            if (m)
+            {
+                d = *dst;
+
+                *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (_mm_movepi64_pi64 (xmmSrc),
+                                                                 _mm_movepi64_pi64 (xmmAlpha),
+                                                                 unpack_32_1x64 (m),
+                                                                 expand565_16_1x64 (d))));
+            }
+
+            w--;
+            dst++;
+            mask++;
+        }
+    }
+
+    _mm_empty ();
+}
+
+/* -------------------------------------------------------------------------------------------------
+ * fbCompositeIn_nx8x8
+ */
+
+void
+fbCompositeIn_nx8x8sse2 (pixman_op_t op,
+                         pixman_image_t * pSrc,
+                         pixman_image_t * pMask,
+                         pixman_image_t * pDst,
+                         int16_t xSrc,
+                         int16_t ySrc,
+                         int16_t xMask,
+                         int16_t yMask,
+                         int16_t xDst,
+                         int16_t yDst,
+                         uint16_t width,
+                         uint16_t height)
+{
+    uint8_t *dstLine, *dst;
+    uint8_t *maskLine, *mask;
+    int dstStride, maskStride;
+    uint16_t w, d, m;
+    uint32_t src;
+    uint8_t sa;
+
+    __m128i xmmAlpha;
+    __m128i xmmMask, xmmMaskLo, xmmMaskHi;
+    __m128i xmmDst, xmmDstLo, xmmDstHi;
+
+    fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
+    fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
+
+    fbComposeGetSolid(pSrc, src, pDst->bits.format);
+
+    sa = src >> 24;
+    if (sa == 0)
+        return;
+
+    xmmAlpha = expandAlpha_1x128 (expandPixel_32_1x128 (src));
+
+    while (height--)
+    {
+        dst = dstLine;
+        dstLine += dstStride;
+        mask = maskLine;
+        maskLine += maskStride;
+        w = width;
+
+        /* call prefetch hint to optimize cache load */
+        cachePrefetch ((__m128i*)mask);
+        cachePrefetch ((__m128i*)dst);
+
+        while (w && ((unsigned long)dst & 15))
+        {
+            m = (uint32_t) *mask++;
+            d = (uint32_t) *dst;
+
+            *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
+                                                               unpack_32_1x64 (d)));
+            w--;
+        }
+
+        /* call prefetch hint to optimize cache load */
+        cachePrefetch ((__m128i*)mask);
+        cachePrefetch ((__m128i*)dst);
+
+        while (w >= 16)
+        {
+            /* fill cache line with next memory */
+            cachePrefetchNext ((__m128i*)mask);
+            cachePrefetchNext ((__m128i*)dst);
+
+            xmmMask = load128Unaligned((__m128i*)mask);
+            xmmDst = load128Aligned((__m128i*)dst);
+
+            unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
+            unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
+
+            pixMultiply_2x128 (xmmAlpha, xmmAlpha, xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
+            pixMultiply_2x128 (xmmMaskLo, xmmMaskHi, xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);
+
+            save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
+
+            mask += 16;
+            dst += 16;
+            w -= 16;
+        }
+
+        while (w)
+        {
+            m = (uint32_t) *mask++;
+            d = (uint32_t) *dst;
+
+            *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
+                                                               unpack_32_1x64 (d)));
+            w--;
+        }
+    }
+
+    _mm_empty();
+}
+
+/*
------------------------------------------------------------------------------------------------- + * fbCompositeIn_8x8 + */ + +void +fbCompositeIn_8x8sse2 (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height) +{ + uint8_t *dstLine, *dst; + uint8_t *srcLine, *src; + int srcStride, dstStride; + uint16_t w; + uint32_t s, d; + + __m128i xmmSrc, xmmSrcLo, xmmSrcHi; + __m128i xmmDst, xmmDstLo, xmmDstHi; + + fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); + fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1); + + while (height--) + { + dst = dstLine; + dstLine += dstStride; + src = srcLine; + srcLine += srcStride; + w = width; + + /* call prefetch hint to optimize cache load*/ + cachePrefetch ((__m128i*)src); + cachePrefetch ((__m128i*)dst); + + while (w && ((unsigned long)dst & 15)) + { + s = (uint32_t) *src++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s),unpack_32_1x64 (d))); + w--; + } + + /* call prefetch hint to optimize cache load*/ + cachePrefetch ((__m128i*)src); + cachePrefetch ((__m128i*)dst); + + while (w >= 16) + { + /* fill cache line with next memory */ + cachePrefetchNext ((__m128i*)src); + cachePrefetchNext ((__m128i*)dst); + + xmmSrc = load128Unaligned((__m128i*)src); + xmmDst = load128Aligned((__m128i*)dst); + + unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); + unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); + + pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi); + + save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); + + src += 16; + dst += 16; + w -= 16; + } + + while (w) + { + s = (uint32_t) *src++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s),unpack_32_1x64 (d))); + w--; + } + } + + _mm_empty (); +} + +/* ------------------------------------------------------------------------------------------------- + * fbCompositeSrcAdd_8888x8x8 + */ + +void +fbCompositeSrcAdd_8888x8x8sse2 (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height) +{ + uint8_t *dstLine, *dst; + uint8_t *maskLine, *mask; + int dstStride, maskStride; + uint16_t w; + uint32_t src; + uint8_t sa; + uint32_t m, d; + + __m128i xmmAlpha; + __m128i xmmMask, xmmMaskLo, xmmMaskHi; + __m128i xmmDst, xmmDstLo, xmmDstHi; + + fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); + fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + + fbComposeGetSolid(pSrc, src, pDst->bits.format); + + sa = src >> 24; + if (sa == 0) + return; + + xmmAlpha = expandAlpha_1x128 (expandPixel_32_1x128 (src)); + + while (height--) + { + dst = dstLine; + dstLine += dstStride; + mask = maskLine; + maskLine += maskStride; + w = width; + + /* call prefetch hint to optimize cache load*/ + cachePrefetch ((__m128i*)mask); + cachePrefetch ((__m128i*)dst); + + while (w && ((unsigned long)dst & 15)) + { + m = (uint32_t) *mask++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)), + unpack_32_1x64 (d))); + w--; + } + + /* call prefetch hint to optimize cache load*/ + cachePrefetch 
((__m128i*)mask); + cachePrefetch ((__m128i*)dst); + + while (w >= 16) + { + /* fill cache line with next memory */ + cachePrefetchNext ((__m128i*)mask); + cachePrefetchNext ((__m128i*)dst); + + xmmMask = load128Unaligned((__m128i*)mask); + xmmDst = load128Aligned((__m128i*)dst); + + unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); + unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); + + pixMultiply_2x128 (xmmAlpha, xmmAlpha, xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); + + xmmDstLo = _mm_adds_epu16 (xmmMaskLo, xmmDstLo); + xmmDstHi = _mm_adds_epu16 (xmmMaskHi, xmmDstHi); + + save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); + + mask += 16; + dst += 16; + w -= 16; + } + + while (w) + { + m = (uint32_t) *mask++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)), + unpack_32_1x64 (d))); + w--; + } + } + + _mm_empty(); +} + +/* ------------------------------------------------------------------------------------------------- + * fbCompositeSrcAdd_8000x8000 + */ + +void +fbCompositeSrcAdd_8000x8000sse2 (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height) +{ + uint8_t *dstLine, *dst; + uint8_t *srcLine, *src; + int dstStride, srcStride; + uint16_t w; + uint16_t t; + + fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1); + fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); + + while (height--) + { + dst = dstLine; + src = srcLine; + + /* call prefetch hint to optimize cache load*/ + cachePrefetch ((__m128i*)src); + cachePrefetch ((__m128i*)dst); + + dstLine += dstStride; + srcLine += srcStride; + w = width; + + /* Small head */ + while (w && (unsigned long)dst & 3) + { + t = (*dst) + (*src++); + *dst++ = t | (0 - (t >> 8)); + w--; + } + + coreCombineAddUsse2 ((uint32_t*)dst, (uint32_t*)src, w >> 2); + + /* Small tail */ + dst += w & 0xfffc; + src += w & 0xfffc; + + w &= 3; + + while (w) + { + t = (*dst) + (*src++); + *dst++ = t | (0 - (t >> 8)); + w--; + } + } + + _mm_empty(); +} + +/* ------------------------------------------------------------------------------------------------- + * fbCompositeSrcAdd_8888x8888 + */ +void +fbCompositeSrcAdd_8888x8888sse2 (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height) +{ + uint32_t *dstLine, *dst; + uint32_t *srcLine, *src; + int dstStride, srcStride; + + fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); + fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); + + while (height--) + { + dst = dstLine; + dstLine += dstStride; + src = srcLine; + srcLine += srcStride; + + coreCombineAddUsse2 (dst, src, width); + } + + _mm_empty(); +} + +/* ------------------------------------------------------------------------------------------------- + * fbCompositeCopyAreasse2 + */ + +pixman_bool_t +pixmanBltsse2 (uint32_t *src_bits, + uint32_t *dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, int src_y, + int dst_x, int dst_y, + int width, int height) +{ + uint8_t * src_bytes; + uint8_t * dst_bytes; + int byte_width; + + if (src_bpp != dst_bpp) + return FALSE; + + if (src_bpp == 16) + { + src_stride = 
src_stride * (int) sizeof (uint32_t) / 2; + dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; + src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); + dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); + byte_width = 2 * width; + src_stride *= 2; + dst_stride *= 2; + } + else if (src_bpp == 32) + { + src_stride = src_stride * (int) sizeof (uint32_t) / 4; + dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; + src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); + dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); + byte_width = 4 * width; + src_stride *= 4; + dst_stride *= 4; + } + else + { + return FALSE; + } + + cachePrefetch ((__m128i*)src_bytes); + cachePrefetch ((__m128i*)dst_bytes); + + while (height--) + { + int w; + uint8_t *s = src_bytes; + uint8_t *d = dst_bytes; + src_bytes += src_stride; + dst_bytes += dst_stride; + w = byte_width; + + cachePrefetchNext ((__m128i*)s); + cachePrefetchNext ((__m128i*)d); + + while (w >= 2 && ((unsigned long)d & 3)) + { + *(uint16_t *)d = *(uint16_t *)s; + w -= 2; + s += 2; + d += 2; + } + + while (w >= 4 && ((unsigned long)d & 15)) + { + *(uint32_t *)d = *(uint32_t *)s; + + w -= 4; + s += 4; + d += 4; + } + + cachePrefetchNext ((__m128i*)s); + cachePrefetchNext ((__m128i*)d); + + while (w >= 64) + { + /* 128 bytes ahead */ + cachePrefetch (((__m128i*)s) + 8); + cachePrefetch (((__m128i*)d) + 8); + + __m128i xmm0, xmm1, xmm2, xmm3; + + xmm0 = load128Unaligned ((__m128i*)(s)); + xmm1 = load128Unaligned ((__m128i*)(s+16)); + xmm2 = load128Unaligned ((__m128i*)(s+32)); + xmm3 = load128Unaligned ((__m128i*)(s+48)); + + save128Aligned ((__m128i*)(d), xmm0); + save128Aligned ((__m128i*)(d+16), xmm1); + save128Aligned ((__m128i*)(d+32), xmm2); + save128Aligned ((__m128i*)(d+48), xmm3); + + s += 64; + d += 64; + w -= 64; + } + + cachePrefetchNext ((__m128i*)s); + cachePrefetchNext ((__m128i*)d); + + while (w >= 16) + { + save128Aligned ((__m128i*)d, load128Unaligned ((__m128i*)s) ); + + w -= 16; + d += 16; + s += 16; + } + + cachePrefetchNext ((__m128i*)s); + cachePrefetchNext ((__m128i*)d); + + while (w >= 4) + { + *(uint32_t *)d = *(uint32_t *)s; + + w -= 4; + s += 4; + d += 4; + } + + if (w >= 2) + { + *(uint16_t *)d = *(uint16_t *)s; + w -= 2; + s += 2; + d += 2; + } + } + + _mm_empty(); + + return TRUE; +} + +void +fbCompositeCopyAreasse2 (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height) +{ + pixmanBltsse2 (pSrc->bits.bits, + pDst->bits.bits, + pSrc->bits.rowstride, + pDst->bits.rowstride, + PIXMAN_FORMAT_BPP (pSrc->bits.format), + PIXMAN_FORMAT_BPP (pDst->bits.format), + xSrc, ySrc, xDst, yDst, width, height); +} + +#if 0 +/* This code are buggy in MMX version, now the bug was translated to SSE2 version */ +void +fbCompositeOver_x888x8x8888sse2 (pixman_op_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height) +{ + uint32_t *src, *srcLine, s; + uint32_t *dst, *dstLine, d; + uint8_t *mask, *maskLine; + uint32_t m; + int srcStride, maskStride, dstStride; + uint16_t w; + + __m128i xmmSrc, xmmSrcLo, xmmSrcHi; + __m128i xmmDst, xmmDstLo, xmmDstHi; + __m128i xmmMask, xmmMaskLo, xmmMaskHi; + + 
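+    /* This block is compiled out (see the #if 0 above).  The path
+     * forces the x888 source opaque by OR-ing in 0xff000000, stores
+     * the source directly wherever the 8-bit mask is 0xff, and
+     * otherwise computes   dst = src*m + dst*(1 - m)   via inOver
+     * with a constant 0xff source alpha. */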
+
+#if 0
+/* FIXME: this code is buggy in the MMX version, and the bug was carried over to this SSE2 port */
+void
+fbCompositeOver_x888x8x8888sse2 (pixman_op_t op,
+                                 pixman_image_t * pSrc,
+                                 pixman_image_t * pMask,
+                                 pixman_image_t * pDst,
+                                 int16_t xSrc,
+                                 int16_t ySrc,
+                                 int16_t xMask,
+                                 int16_t yMask,
+                                 int16_t xDst,
+                                 int16_t yDst,
+                                 uint16_t width,
+                                 uint16_t height)
+{
+    uint32_t *src, *srcLine, s;
+    uint32_t *dst, *dstLine, d;
+    uint8_t *mask, *maskLine;
+    uint32_t m;
+    int srcStride, maskStride, dstStride;
+    uint16_t w;
+
+    __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
+    __m128i xmmDst, xmmDstLo, xmmDstHi;
+    __m128i xmmMask, xmmMaskLo, xmmMaskHi;
+
+    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
+    fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
+    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
+
+    while (height--)
+    {
+        src = srcLine;
+        srcLine += srcStride;
+        dst = dstLine;
+        dstLine += dstStride;
+        mask = maskLine;
+        maskLine += maskStride;
+
+        w = width;
+
+        /* issue prefetch hints to warm the cache */
+        cachePrefetch ((__m128i*)src);
+        cachePrefetch ((__m128i*)dst);
+        cachePrefetch ((__m128i*)mask);
+
+        /* scalar head: align dst to a 16-byte boundary */
+        while (w && (unsigned long)dst & 15)
+        {
+            __m64 ms;
+
+            s = 0xff000000 | *src++;
+            m = (uint32_t) *mask++;
+            d = *dst;
+
+            ms = unpack_32_1x64 (s);
+
+            if (m != 0xff)
+            {
+                ms = inOver_1x64 (ms,
+                                  xMask00ff,
+                                  expandAlphaRev_1x64 (unpack_32_1x64 (m)),
+                                  unpack_32_1x64 (d));
+            }
+
+            *dst++ = pack_1x64_32 (ms);
+            w--;
+        }
+
+        /* issue prefetch hints to warm the cache */
+        cachePrefetch ((__m128i*)src);
+        cachePrefetch ((__m128i*)dst);
+        cachePrefetch ((__m128i*)mask);
+
+        while (w >= 4)
+        {
+            /* prefetch the next cache lines */
+            cachePrefetchNext ((__m128i*)src);
+            cachePrefetchNext ((__m128i*)dst);
+            cachePrefetchNext ((__m128i*)mask);
+
+            m = *(uint32_t*) mask;
+            xmmSrc = _mm_or_si128 (load128Unaligned ((__m128i*)src), Maskff000000);
+
+            if (m == 0xffffffff)
+            {
+                save128Aligned ((__m128i*)dst, xmmSrc);
+            }
+            else
+            {
+                xmmDst = load128Aligned ((__m128i*)dst);
+
+                xmmMask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
+
+                unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
+                unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
+                unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
+
+                expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
+
+                inOver_2x128 (xmmSrcLo, xmmSrcHi, Mask00ff, Mask00ff, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi);
+
+                save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
+            }
+
+            src += 4;
+            dst += 4;
+            mask += 4;
+            w -= 4;
+        }
+
+        /* scalar tail */
+        while (w)
+        {
+            m = (uint32_t) *mask++;
+
+            if (m)
+            {
+                s = 0xff000000 | *src;
+
+                if (m == 0xff)
+                {
+                    *dst = s;
+                }
+                else
+                {
+                    d = *dst;
+
+                    *dst = pack_1x64_32 (inOver_1x64 (unpack_32_1x64 (s),
+                                                      xMask00ff,
+                                                      expandAlphaRev_1x64 (unpack_32_1x64 (m)),
+                                                      unpack_32_1x64 (d)));
+                }
+            }
+
+            src++;
+            dst++;
+            w--;
+        }
+    }
+
+    _mm_empty();
+}
+#endif /* #if 0 */
 #endif /* USE_SSE2 */
diff --git a/pixman/pixman-sse.h b/pixman/pixman-sse.h
index 65cf21d..acec621 100644
--- a/pixman/pixman-sse.h
+++ b/pixman/pixman-sse.h
@@ -1,5 +1,6 @@
 /*
  * Copyright © 2008 Rodrigo Kumpera
+ * Copyright © 2008 André Tupinambá
  *
  * Permission to use, copy, modify, distribute, and sell this software and its
  * documentation for any purpose is hereby granted without fee, provided that
@@ -21,7 +22,9 @@
  * SOFTWARE.
  *
  * Author: Rodrigo Kumpera (kumpera@gmail.com)
+ *         André Tupinambá (andrelrt@gmail.com)
  *
+ * Based on work by Owen Taylor and Søren Sandmann
  */
 #ifndef _PIXMAN_SSE_H_
 #define _PIXMAN_SSE_H_
@@ -48,6 +51,308 @@
 pixman_bool_t pixman_have_sse(void);
 
 void fbComposeSetupSSE(void);
 
+pixman_bool_t
+pixmanFillsse2 (uint32_t *bits,
+                int stride,
+                int bpp,
+                int x,
+                int y,
+                int width,
+                int height,
+                uint32_t data);
+
+pixman_bool_t
+pixmanBltsse2 (uint32_t *src_bits,
+               uint32_t *dst_bits,
+               int src_stride,
+               int dst_stride,
+               int src_bpp,
+               int dst_bpp,
+               int src_x, int src_y,
+               int dst_x, int dst_y,
+               int width, int height);
+
+void
+fbCompositeSolid_nx8888sse2 (pixman_op_t op,
+                             pixman_image_t * pSrc,
+                             pixman_image_t * pMask,
+                             pixman_image_t * pDst,
+                             int16_t xSrc,
+                             int16_t ySrc,
+                             int16_t xMask,
+                             int16_t yMask,
+                             int16_t xDst,
+                             int16_t yDst,
+                             uint16_t width,
+                             uint16_t height);
+
+void
+fbCompositeSolid_nx0565sse2 (pixman_op_t op,
+                             pixman_image_t * pSrc,
+                             pixman_image_t * pMask,
+                             pixman_image_t * pDst,
+                             int16_t xSrc,
+                             int16_t ySrc,
+                             int16_t xMask,
+                             int16_t yMask,
+                             int16_t xDst,
+                             int16_t yDst,
+                             uint16_t width,
+                             uint16_t height);
+
+void
+fbCompositeSolidMask_nx8888x8888Csse2 (pixman_op_t op,
+                                       pixman_image_t * pSrc,
+                                       pixman_image_t * pMask,
+                                       pixman_image_t * pDst,
+                                       int16_t xSrc,
+                                       int16_t ySrc,
+                                       int16_t xMask,
+                                       int16_t yMask,
+                                       int16_t xDst,
+                                       int16_t yDst,
+                                       uint16_t width,
+                                       uint16_t height);
+
+void
+fbCompositeSrc_8888x8x8888sse2 (pixman_op_t op,
+                                pixman_image_t * pSrc,
+                                pixman_image_t * pMask,
+                                pixman_image_t * pDst,
+                                int16_t xSrc,
+                                int16_t ySrc,
+                                int16_t xMask,
+                                int16_t yMask,
+                                int16_t xDst,
+                                int16_t yDst,
+                                uint16_t width,
+                                uint16_t height);
+
+void
+fbCompositeSrc_x888xnx8888sse2 (pixman_op_t op,
+                                pixman_image_t * pSrc,
+                                pixman_image_t * pMask,
+                                pixman_image_t * pDst,
+                                int16_t xSrc,
+                                int16_t ySrc,
+                                int16_t xMask,
+                                int16_t yMask,
+                                int16_t xDst,
+                                int16_t yDst,
+                                uint16_t width,
+                                uint16_t height);
+
+void
+fbCompositeSrc_8888x8888sse2 (pixman_op_t op,
+                              pixman_image_t * pSrc,
+                              pixman_image_t * pMask,
+                              pixman_image_t * pDst,
+                              int16_t xSrc,
+                              int16_t ySrc,
+                              int16_t xMask,
+                              int16_t yMask,
+                              int16_t xDst,
+                              int16_t yDst,
+                              uint16_t width,
+                              uint16_t height);
+
+void
+fbCompositeSrc_8888x0565sse2 (pixman_op_t op,
+                              pixman_image_t * pSrc,
+                              pixman_image_t * pMask,
+                              pixman_image_t * pDst,
+                              int16_t xSrc,
+                              int16_t ySrc,
+                              int16_t xMask,
+                              int16_t yMask,
+                              int16_t xDst,
+                              int16_t yDst,
+                              uint16_t width,
+                              uint16_t height);
+
+void
+fbCompositeSolidMask_nx8x8888sse2 (pixman_op_t op,
+                                   pixman_image_t * pSrc,
+                                   pixman_image_t * pMask,
+                                   pixman_image_t * pDst,
+                                   int16_t xSrc,
+                                   int16_t ySrc,
+                                   int16_t xMask,
+                                   int16_t yMask,
+                                   int16_t xDst,
+                                   int16_t yDst,
+                                   uint16_t width,
+                                   uint16_t height);
+
+void
+fbCompositeSolidMaskSrc_nx8x8888sse2 (pixman_op_t op,
+                                      pixman_image_t * pSrc,
+                                      pixman_image_t * pMask,
+                                      pixman_image_t * pDst,
+                                      int16_t xSrc,
+                                      int16_t ySrc,
+                                      int16_t xMask,
+                                      int16_t yMask,
+                                      int16_t xDst,
+                                      int16_t yDst,
+                                      uint16_t width,
+                                      uint16_t height);
+
+void
+fbCompositeSolidMask_nx8x0565sse2 (pixman_op_t op,
+                                   pixman_image_t * pSrc,
+                                   pixman_image_t * pMask,
+                                   pixman_image_t * pDst,
+                                   int16_t xSrc,
+                                   int16_t ySrc,
+                                   int16_t xMask,
+                                   int16_t yMask,
+                                   int16_t xDst,
+                                   int16_t yDst,
+                                   uint16_t width,
+                                   uint16_t height);
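+
+/* Naming note (inferred from the MMX counterparts, not normative):
+ * entry points follow fbComposite<Op>_<src>x<mask>x<dst>sse2, where
+ * "n" is a solid source, a trailing "C" means component alpha, and
+ * "RevNP" appears to denote a reversed, non-premultiplied pixbuf.
+ */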
+
+void
+fbCompositeSrc_8888RevNPx0565sse2 (pixman_op_t op,
+                                   pixman_image_t * pSrc,
+                                   pixman_image_t * pMask,
+                                   pixman_image_t * pDst,
+                                   int16_t xSrc,
+                                   int16_t ySrc,
+                                   int16_t xMask,
+                                   int16_t yMask,
+                                   int16_t xDst,
+                                   int16_t yDst,
+                                   uint16_t width,
+                                   uint16_t height);
+
+void
+fbCompositeSrc_8888RevNPx8888sse2 (pixman_op_t op,
+                                   pixman_image_t * pSrc,
+                                   pixman_image_t * pMask,
+                                   pixman_image_t * pDst,
+                                   int16_t xSrc,
+                                   int16_t ySrc,
+                                   int16_t xMask,
+                                   int16_t yMask,
+                                   int16_t xDst,
+                                   int16_t yDst,
+                                   uint16_t width,
+                                   uint16_t height);
+
+void
+fbCompositeSolidMask_nx8888x0565Csse2 (pixman_op_t op,
+                                       pixman_image_t * pSrc,
+                                       pixman_image_t * pMask,
+                                       pixman_image_t * pDst,
+                                       int16_t xSrc,
+                                       int16_t ySrc,
+                                       int16_t xMask,
+                                       int16_t yMask,
+                                       int16_t xDst,
+                                       int16_t yDst,
+                                       uint16_t width,
+                                       uint16_t height);
+
+void
+fbCompositeIn_nx8x8sse2 (pixman_op_t op,
+                         pixman_image_t * pSrc,
+                         pixman_image_t * pMask,
+                         pixman_image_t * pDst,
+                         int16_t xSrc,
+                         int16_t ySrc,
+                         int16_t xMask,
+                         int16_t yMask,
+                         int16_t xDst,
+                         int16_t yDst,
+                         uint16_t width,
+                         uint16_t height);
+
+void
+fbCompositeIn_8x8sse2 (pixman_op_t op,
+                       pixman_image_t * pSrc,
+                       pixman_image_t * pMask,
+                       pixman_image_t * pDst,
+                       int16_t xSrc,
+                       int16_t ySrc,
+                       int16_t xMask,
+                       int16_t yMask,
+                       int16_t xDst,
+                       int16_t yDst,
+                       uint16_t width,
+                       uint16_t height);
+
+void
+fbCompositeSrcAdd_8888x8x8sse2 (pixman_op_t op,
+                                pixman_image_t * pSrc,
+                                pixman_image_t * pMask,
+                                pixman_image_t * pDst,
+                                int16_t xSrc,
+                                int16_t ySrc,
+                                int16_t xMask,
+                                int16_t yMask,
+                                int16_t xDst,
+                                int16_t yDst,
+                                uint16_t width,
+                                uint16_t height);
+
+void
+fbCompositeSrcAdd_8000x8000sse2 (pixman_op_t op,
+                                 pixman_image_t * pSrc,
+                                 pixman_image_t * pMask,
+                                 pixman_image_t * pDst,
+                                 int16_t xSrc,
+                                 int16_t ySrc,
+                                 int16_t xMask,
+                                 int16_t yMask,
+                                 int16_t xDst,
+                                 int16_t yDst,
+                                 uint16_t width,
+                                 uint16_t height);
+
+void
+fbCompositeSrcAdd_8888x8888sse2 (pixman_op_t op,
+                                 pixman_image_t * pSrc,
+                                 pixman_image_t * pMask,
+                                 pixman_image_t * pDst,
+                                 int16_t xSrc,
+                                 int16_t ySrc,
+                                 int16_t xMask,
+                                 int16_t yMask,
+                                 int16_t xDst,
+                                 int16_t yDst,
+                                 uint16_t width,
+                                 uint16_t height);
+
+void
+fbCompositeCopyAreasse2 (pixman_op_t op,
+                         pixman_image_t * pSrc,
+                         pixman_image_t * pMask,
+                         pixman_image_t * pDst,
+                         int16_t xSrc,
+                         int16_t ySrc,
+                         int16_t xMask,
+                         int16_t yMask,
+                         int16_t xDst,
+                         int16_t yDst,
+                         uint16_t width,
+                         uint16_t height);
+
+void
+fbCompositeOver_x888x8x8888sse2 (pixman_op_t op,
+                                 pixman_image_t * pSrc,
+                                 pixman_image_t * pMask,
+                                 pixman_image_t * pDst,
+                                 int16_t xSrc,
+                                 int16_t ySrc,
+                                 int16_t xMask,
+                                 int16_t yMask,
+                                 int16_t xDst,
+                                 int16_t yDst,
+                                 uint16_t width,
+                                 uint16_t height);
+
 #endif /* USE_SSE2 */
 #endif /* _PIXMAN_SSE_H_ */
-- 
2.7.4