X-Git-Url: http://review.tizen.org/git/?a=blobdiff_plain;ds=sidebyside;f=pixman%2Fpixman-mmx.c;h=36cf2cd3e8ae69993c58c6c5b5a472908b8ea9a5;hb=084e3f2f4be900041cc35830359606addc1fc3be;hp=900f2e4ce974212bce2f4106aff1dd3e462ca04e;hpb=b7fe2f3378c6fb0828e863cb1a2df9191fb3e25e;p=profile%2Fivi%2Fpixman.git diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c index 900f2e4..36cf2cd 100644 --- a/pixman/pixman-mmx.c +++ b/pixman/pixman-mmx.c @@ -33,25 +33,71 @@ #include #endif -#ifdef USE_MMX +#if defined USE_X86_MMX || defined USE_ARM_IWMMXT #include +#include "pixman-private.h" +#include "pixman-combine32.h" -#include "pixman-mmx.h" - -#undef READ -#undef WRITE -#define READ(img,x) *(x) -#define WRITE(img,ptr,v) (*(ptr) = (v)); - -#define noVERBOSE +#define no_vERBOSE #ifdef VERBOSE -#define CHECKPOINT() ErrorF ("at %s %d\n", __FUNCTION__, __LINE__) +#define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__) #else #define CHECKPOINT() #endif +#ifdef USE_ARM_IWMMXT +/* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_empty (void) +{ + +} +#endif + +#ifdef USE_X86_MMX +# if (defined(__SUNPRO_C) || defined(_MSC_VER)) +# include +# else +/* We have to compile with -msse to use xmmintrin.h, but that causes SSE + * instructions to be generated that we don't want. Just duplicate the + * functions we want to use. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mulhi_pu16 (__m64 __A, __m64 __B) +{ + asm ("pmulhuw %1, %0\n\t" + : "+y" (__A) + : "y" (__B) + ); + return __A; +} + +# ifdef __OPTIMIZE__ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_pi16 (__m64 __A, int8_t const __N) +{ + __m64 ret; + + asm ("pshufw %2, %1, %0\n\t" + : "=y" (ret) + : "y" (__A), "K" (__N) + ); + + return ret; +} +# else +# define _mm_shuffle_pi16(A, N) \ + ((__m64) __builtin_ia32_pshufw ((__v4hi)(__m64)(A), (int)(N))) +# endif +# endif +#endif + +#ifndef _MSC_VER +#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \ + (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0)) +#endif + /* Notes about writing mmx code * * give memory operands as the second operand. If you give it as the @@ -73,17 +119,38 @@ /* --------------- MMX primitives ------------------------------------- */ -#ifdef __GNUC__ +/* If __m64 is defined as a struct or union, then define M64_MEMBER to be + * the name of the member used to access the data. + * If __m64 requires using mm_cvt* intrinsics functions to convert between + * uint64_t and __m64 values, then define USE_CVT_INTRINSICS. + * If __m64 and uint64_t values can just be cast to each other directly, + * then define USE_M64_CASTS. + */ +#ifdef _MSC_VER +# define M64_MEMBER m64_u64 +#elif defined(__ICC) +# define USE_CVT_INTRINSICS +#elif defined(__GNUC__) +# define USE_M64_CASTS +#elif defined(__SUNPRO_C) +# if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__) +/* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__) + * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__ + * is defined. If it is used, then the mm_cvt* intrinsics must be used. + */ +# define USE_CVT_INTRINSICS +# else +/* For Studio 12.2 or older, or when __attribute__(__vector_size__) is + * disabled, __m64 is defined as a struct containing "unsigned long long l_". + */ +# define M64_MEMBER l_ +# endif +#endif + +#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) typedef uint64_t mmxdatafield; #else typedef __m64 mmxdatafield; -/* If __m64 is defined as a struct or union, define M64_MEMBER to be the - name of the member used to access the data */ -# ifdef _MSC_VER -# define M64_MEMBER m64_u64 -# elif defined(__SUNPRO_C) -# define M64_MEMBER l_ -# endif #endif typedef struct @@ -100,78 +167,73 @@ typedef struct mmxdatafield mmx_mask_2; mmxdatafield mmx_mask_3; mmxdatafield mmx_full_alpha; - mmxdatafield mmx_ffff0000ffff0000; - mmxdatafield mmx_0000ffff00000000; - mmxdatafield mmx_000000000000ffff; -} MMXData; + mmxdatafield mmx_4x0101; +} mmx_data_t; #if defined(_MSC_VER) -# define MMXDATA_INIT(field, val) { val##UI64 } -#elif defined(M64_MEMBER) /* __m64 is a struct, not an integral type */ -# define MMXDATA_INIT(field, val) field = { val##ULL } -#else /* __m64 is an integral type */ -# define MMXDATA_INIT(field, val) field = val##ULL +# define MMXDATA_INIT(field, val) { val ## UI64 } +#elif defined(M64_MEMBER) /* __m64 is a struct, not an integral type */ +# define MMXDATA_INIT(field, val) field = { val ## ULL } +#else /* mmxdatafield is an integral type */ +# define MMXDATA_INIT(field, val) field = val ## ULL #endif -static const MMXData c = -{ - MMXDATA_INIT(.mmx_4x00ff, 0x00ff00ff00ff00ff), - MMXDATA_INIT(.mmx_4x0080, 0x0080008000800080), - MMXDATA_INIT(.mmx_565_rgb, 0x000001f0003f001f), - MMXDATA_INIT(.mmx_565_unpack_multiplier, 0x0000008404100840), - MMXDATA_INIT(.mmx_565_r, 0x000000f800000000), - MMXDATA_INIT(.mmx_565_g, 0x0000000000fc0000), - MMXDATA_INIT(.mmx_565_b, 0x00000000000000f8), - MMXDATA_INIT(.mmx_mask_0, 0xffffffffffff0000), - MMXDATA_INIT(.mmx_mask_1, 0xffffffff0000ffff), - MMXDATA_INIT(.mmx_mask_2, 0xffff0000ffffffff), - MMXDATA_INIT(.mmx_mask_3, 0x0000ffffffffffff), - MMXDATA_INIT(.mmx_full_alpha, 0x00ff000000000000), - MMXDATA_INIT(.mmx_ffff0000ffff0000, 0xffff0000ffff0000), - MMXDATA_INIT(.mmx_0000ffff00000000, 0x0000ffff00000000), - MMXDATA_INIT(.mmx_000000000000ffff, 0x000000000000ffff), +static const mmx_data_t c = +{ + MMXDATA_INIT (.mmx_4x00ff, 0x00ff00ff00ff00ff), + MMXDATA_INIT (.mmx_4x0080, 0x0080008000800080), + MMXDATA_INIT (.mmx_565_rgb, 0x000001f0003f001f), + MMXDATA_INIT (.mmx_565_unpack_multiplier, 0x0000008404100840), + MMXDATA_INIT (.mmx_565_r, 0x000000f800000000), + MMXDATA_INIT (.mmx_565_g, 0x0000000000fc0000), + MMXDATA_INIT (.mmx_565_b, 0x00000000000000f8), + MMXDATA_INIT (.mmx_mask_0, 0xffffffffffff0000), + MMXDATA_INIT (.mmx_mask_1, 0xffffffff0000ffff), + MMXDATA_INIT (.mmx_mask_2, 0xffff0000ffffffff), + MMXDATA_INIT (.mmx_mask_3, 0x0000ffffffffffff), + MMXDATA_INIT (.mmx_full_alpha, 0x00ff000000000000), + MMXDATA_INIT (.mmx_4x0101, 0x0101010101010101), }; -#ifdef __GNUC__ -# ifdef __ICC -# define MC(x) M64(c.mmx_##x) -# else -# define MC(x) ((__m64)c.mmx_##x) -# endif +#ifdef USE_CVT_INTRINSICS +# define MC(x) to_m64 (c.mmx_ ## x) +#elif defined(USE_M64_CASTS) +# define MC(x) ((__m64)c.mmx_ ## x) #else -# define MC(x) c.mmx_##x +# define MC(x) c.mmx_ ## x #endif static force_inline __m64 -M64 (uint64_t x) +to_m64 (uint64_t x) { -#ifdef __ICC +#ifdef USE_CVT_INTRINSICS return _mm_cvtsi64_m64 (x); -#elif defined M64_MEMBER /* __m64 is a struct, not an integral type */ +#elif defined M64_MEMBER /* __m64 is a struct, not an integral type */ __m64 res; res.M64_MEMBER = x; return res; -#else /* __m64 is an integral type */ +#else /* USE_M64_CASTS */ return (__m64)x; #endif } static force_inline uint64_t -UINT64 (__m64 x) +to_uint64 (__m64 x) { -#ifdef __ICC +#ifdef USE_CVT_INTRINSICS return _mm_cvtm64_si64 (x); -#elif defined M64_MEMBER /* __m64 is a struct, not an integral type */ +#elif defined M64_MEMBER /* __m64 is a struct, not an integral type */ uint64_t res = x.M64_MEMBER; return res; -#else /* __m64 is an integral type */ +#else /* USE_M64_CASTS */ return (uint64_t)x; #endif } static force_inline __m64 -shift (__m64 v, int s) +shift (__m64 v, + int s) { if (s > 0) return _mm_slli_si64 (v, s); @@ -184,7 +246,7 @@ shift (__m64 v, int s) static force_inline __m64 negate (__m64 mask) { - return _mm_xor_si64 (mask, MC(4x00ff)); + return _mm_xor_si64 (mask, MC (4x00ff)); } static force_inline __m64 @@ -193,9 +255,8 @@ pix_multiply (__m64 a, __m64 b) __m64 res; res = _mm_mullo_pi16 (a, b); - res = _mm_adds_pu16 (res, MC(4x0080)); - res = _mm_adds_pu16 (res, _mm_srli_pi16 (res, 8)); - res = _mm_srli_pi16 (res, 8); + res = _mm_adds_pu16 (res, MC (4x0080)); + res = _mm_mulhi_pu16 (res, MC (4x0101)); return res; } @@ -203,107 +264,108 @@ pix_multiply (__m64 a, __m64 b) static force_inline __m64 pix_add (__m64 a, __m64 b) { - return _mm_adds_pu8 (a, b); + return _mm_adds_pu8 (a, b); } static force_inline __m64 expand_alpha (__m64 pixel) { - __m64 t1, t2; - - t1 = shift (pixel, -48); - t2 = shift (t1, 16); - t1 = _mm_or_si64 (t1, t2); - t2 = shift (t1, 32); - t1 = _mm_or_si64 (t1, t2); - - return t1; + return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3)); } static force_inline __m64 expand_alpha_rev (__m64 pixel) { - __m64 t1, t2; - - /* move alpha to low 16 bits and zero the rest */ - t1 = shift (pixel, 48); - t1 = shift (t1, -48); - - t2 = shift (t1, 16); - t1 = _mm_or_si64 (t1, t2); - t2 = shift (t1, 32); - t1 = _mm_or_si64 (t1, t2); - - return t1; + return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0)); } static force_inline __m64 invert_colors (__m64 pixel) { - __m64 x, y, z; - - x = y = z = pixel; - - x = _mm_and_si64 (x, MC(ffff0000ffff0000)); - y = _mm_and_si64 (y, MC(000000000000ffff)); - z = _mm_and_si64 (z, MC(0000ffff00000000)); - - y = shift (y, 32); - z = shift (z, -32); - - x = _mm_or_si64 (x, y); - x = _mm_or_si64 (x, z); - - return x; + return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2)); } static force_inline __m64 -over (__m64 src, __m64 srca, __m64 dest) +over (__m64 src, + __m64 srca, + __m64 dest) { - return _mm_adds_pu8 (src, pix_multiply(dest, negate(srca))); + return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca))); } static force_inline __m64 over_rev_non_pre (__m64 src, __m64 dest) { __m64 srca = expand_alpha (src); - __m64 srcfaaa = _mm_or_si64 (srca, MC(full_alpha)); + __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha)); - return over(pix_multiply(invert_colors(src), srcfaaa), srca, dest); + return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest); } static force_inline __m64 -in (__m64 src, - __m64 mask) +in (__m64 src, __m64 mask) { return pix_multiply (src, mask); } +#ifndef _MSC_VER static force_inline __m64 -in_over_full_src_alpha (__m64 src, __m64 mask, __m64 dest) +in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest) { - src = _mm_or_si64 (src, MC(full_alpha)); - - return over(in (src, mask), mask, dest); + return over (in (src, mask), pix_multiply (srca, mask), dest); } -#ifndef _MSC_VER -static force_inline __m64 -in_over (__m64 src, - __m64 srca, - __m64 mask, - __m64 dest) +#else + +#define in_over(src, srca, mask, dest) \ + over (in (src, mask), pix_multiply (srca, mask), dest) + +#endif + +/* Elemental unaligned loads */ + +static force_inline __m64 ldq_u(uint64_t *p) { - return over(in(src, mask), pix_multiply(srca, mask), dest); +#ifdef USE_X86_MMX + /* x86's alignment restrictions are very relaxed. */ + return *(__m64 *)p; +#elif defined USE_ARM_IWMMXT + int align = (uintptr_t)p & 7; + __m64 *aligned_p; + if (align == 0) + return *p; + aligned_p = (__m64 *)((uintptr_t)p & ~7); + return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align); +#else + struct __una_u64 { uint64_t x __attribute__((packed)); }; + const struct __una_u64 *ptr = (const struct __una_u64 *) p; + return (__m64) ptr->x; +#endif } + +static force_inline uint32_t ldl_u(const uint32_t *p) +{ +#ifdef USE_X86_MMX + /* x86's alignment restrictions are very relaxed. */ + return *p; #else -#define in_over(src, srca, mask, dest) over(in(src, mask), pix_multiply(srca, mask), dest) + struct __una_u32 { uint32_t x __attribute__((packed)); }; + const struct __una_u32 *ptr = (const struct __una_u32 *) p; + return ptr->x; #endif +} static force_inline __m64 -load8888 (uint32_t v) +load8888 (const uint32_t *v) { - return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64()); + return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (*v), _mm_setzero_si64 ()); +} + +static force_inline __m64 +load8888u (const uint32_t *v) +{ + uint32_t l = ldl_u(v); + return load8888(&l); } static force_inline __m64 @@ -312,10 +374,17 @@ pack8888 (__m64 lo, __m64 hi) return _mm_packs_pu16 (lo, hi); } -static force_inline uint32_t -store8888 (__m64 v) +static force_inline void +store (uint32_t *dest, __m64 v) +{ + *dest = _mm_cvtsi64_si32 (v); +} + +static force_inline void +store8888 (uint32_t *dest, __m64 v) { - return _mm_cvtsi64_si32(pack8888(v, _mm_setzero_si64())); + v = pack8888 (v, _mm_setzero_si64()); + store (dest, v); } /* Expand 16 bits positioned at @pos (0-3) of a mmx register into @@ -346,9 +415,9 @@ expand565 (__m64 pixel, int pos) p = _mm_or_si64 (t1, p); p = _mm_or_si64 (t2, p); - p = _mm_and_si64 (p, MC(565_rgb)); + p = _mm_and_si64 (p, MC (565_rgb)); - pixel = _mm_mullo_pi16 (p, MC(565_unpack_multiplier)); + pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier)); return _mm_srli_pi16 (pixel, 8); } @@ -356,40 +425,40 @@ static force_inline __m64 expand8888 (__m64 in, int pos) { if (pos == 0) - return _mm_unpacklo_pi8 (in, _mm_setzero_si64()); + return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ()); else - return _mm_unpackhi_pi8 (in, _mm_setzero_si64()); + return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ()); } static force_inline __m64 expandx888 (__m64 in, int pos) { - return _mm_or_si64 (expand8888 (in, pos), MC(full_alpha)); + return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha)); } static force_inline __m64 -pack565 (__m64 pixel, __m64 target, int pos) +pack_565 (__m64 pixel, __m64 target, int pos) { __m64 p = pixel; __m64 t = target; __m64 r, g, b; - r = _mm_and_si64 (p, MC(565_r)); - g = _mm_and_si64 (p, MC(565_g)); - b = _mm_and_si64 (p, MC(565_b)); + r = _mm_and_si64 (p, MC (565_r)); + g = _mm_and_si64 (p, MC (565_g)); + b = _mm_and_si64 (p, MC (565_b)); - r = shift (r, - (32 - 8) + pos * 16); - g = shift (g, - (16 - 3) + pos * 16); - b = shift (b, - (0 + 3) + pos * 16); + r = shift (r, -(32 - 8) + pos * 16); + g = shift (g, -(16 - 3) + pos * 16); + b = shift (b, -(0 + 3) + pos * 16); if (pos == 0) - t = _mm_and_si64 (t, MC(mask_0)); + t = _mm_and_si64 (t, MC (mask_0)); else if (pos == 1) - t = _mm_and_si64 (t, MC(mask_1)); + t = _mm_and_si64 (t, MC (mask_1)); else if (pos == 2) - t = _mm_and_si64 (t, MC(mask_2)); + t = _mm_and_si64 (t, MC (mask_2)); else if (pos == 3) - t = _mm_and_si64 (t, MC(mask_3)); + t = _mm_and_si64 (t, MC (mask_3)); p = _mm_or_si64 (r, t); p = _mm_or_si64 (g, p); @@ -398,52 +467,27 @@ pack565 (__m64 pixel, __m64 target, int pos) } #ifndef _MSC_VER + static force_inline __m64 pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b) { - x = _mm_mullo_pi16 (x, a); - y = _mm_mullo_pi16 (y, b); - x = _mm_adds_pu16 (x, MC(4x0080)); - x = _mm_adds_pu16 (x, y); - x = _mm_adds_pu16 (x, _mm_srli_pi16 (x, 8)); - x = _mm_srli_pi16 (x, 8); + x = pix_multiply (x, a); + y = pix_multiply (y, b); - return x; + return pix_add (x, y); } + #else -#define pix_add_mul(x, a, y, b) \ -( x = _mm_mullo_pi16 (x, a), \ - y = _mm_mullo_pi16 (y, b), \ - x = _mm_adds_pu16 (x, MC(4x0080)), \ - x = _mm_adds_pu16 (x, y), \ - x = _mm_adds_pu16 (x, _mm_srli_pi16 (x, 8)), \ - _mm_srli_pi16 (x, 8) ) + +#define pix_add_mul(x, a, y, b) \ + ( x = pix_multiply (x, a), \ + y = pix_multiply (y, b), \ + pix_add (x, y) ) + #endif /* --------------- MMX code patch for fbcompose.c --------------------- */ -static FASTCALL void -mmxCombineMaskU (uint32_t *src, const uint32_t *mask, int width) -{ - const uint32_t *end = mask + width; - while (mask < end) { - uint32_t mmask = *mask; - uint32_t maska = mmask >> 24; - if (maska == 0) { - *src = 0; - } else if (maska != 0xff) { - __m64 a = load8888(mmask); - __m64 s = load8888(*src); - a = expand_alpha(a); - s = pix_multiply(s, a); - *src = store8888(s); - } - ++src; - ++mask; - } - _mm_empty(); -} - static force_inline uint32_t combine (const uint32_t *src, const uint32_t *mask) { @@ -451,565 +495,710 @@ combine (const uint32_t *src, const uint32_t *mask) if (mask) { - __m64 m = load8888 (*mask); - __m64 s = load8888 (ssrc); + __m64 m = load8888 (mask); + __m64 s = load8888 (&ssrc); m = expand_alpha (m); s = pix_multiply (s, m); - ssrc = store8888 (s); + store8888 (&ssrc, s); } return ssrc; } -static FASTCALL void -mmxCombineOverU (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_over_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = dest + width; - while (dest < end) { + while (dest < end) + { uint32_t ssrc = combine (src, mask); uint32_t a = ssrc >> 24; - if (a == 0xff) { + + if (a == 0xff) + { *dest = ssrc; - } else if (a) { + } + else if (ssrc) + { __m64 s, sa; - s = load8888(ssrc); - sa = expand_alpha(s); - *dest = store8888(over(s, sa, load8888(*dest))); + s = load8888 (&ssrc); + sa = expand_alpha (s); + store8888 (dest, over (s, sa, load8888 (dest))); } + ++dest; ++src; if (mask) ++mask; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineOverReverseU (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_over_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = dest + width; - while (dest < end) { + while (dest < end) + { __m64 d, da; uint32_t s = combine (src, mask); - d = load8888(*dest); - da = expand_alpha(d); - *dest = store8888(over (d, da, load8888(s))); - ++dest; - ++src; + + d = load8888 (dest); + da = expand_alpha (d); + store8888 (dest, over (d, da, load8888 (&s))); + + ++dest; + ++src; if (mask) mask++; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineInU (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_in_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = dest + width; - while (dest < end) { - __m64 x, a; - x = load8888 (combine (src, mask)); - a = load8888(*dest); - a = expand_alpha(a); - x = pix_multiply(x, a); - *dest = store8888(x); - ++dest; - ++src; + while (dest < end) + { + __m64 x, a; + uint32_t ssrc = combine (src, mask); + + x = load8888 (&ssrc); + a = load8888 (dest); + a = expand_alpha (a); + x = pix_multiply (x, a); + + store8888 (dest, x); + + ++dest; + ++src; if (mask) mask++; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineInReverseU (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_in_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = dest + width; - while (dest < end) { - __m64 x, a; - x = load8888(*dest); - a = load8888(combine (src, mask)); - a = expand_alpha(a); - x = pix_multiply(x, a); - *dest = store8888(x); - ++dest; - ++src; + while (dest < end) + { + __m64 x, a; + uint32_t ssrc = combine (src, mask); + + x = load8888 (dest); + a = load8888 (&ssrc); + a = expand_alpha (a); + x = pix_multiply (x, a); + store8888 (dest, x); + + ++dest; + ++src; if (mask) mask++; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineOutU (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_out_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = dest + width; - while (dest < end) { - __m64 x, a; - x = load8888(combine (src, mask)); - a = load8888(*dest); - a = expand_alpha(a); - a = negate(a); - x = pix_multiply(x, a); - *dest = store8888(x); - ++dest; - ++src; + while (dest < end) + { + __m64 x, a; + uint32_t ssrc = combine (src, mask); + + x = load8888 (&ssrc); + a = load8888 (dest); + a = expand_alpha (a); + a = negate (a); + x = pix_multiply (x, a); + store8888 (dest, x); + + ++dest; + ++src; if (mask) mask++; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineOutReverseU (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_out_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = dest + width; - while (dest < end) { - __m64 x, a; - x = load8888(*dest); - a = load8888(combine (src, mask)); - a = expand_alpha(a); - a = negate(a); - x = pix_multiply(x, a); - *dest = store8888(x); - ++dest; - ++src; + while (dest < end) + { + __m64 x, a; + uint32_t ssrc = combine (src, mask); + + x = load8888 (dest); + a = load8888 (&ssrc); + a = expand_alpha (a); + a = negate (a); + x = pix_multiply (x, a); + + store8888 (dest, x); + + ++dest; + ++src; if (mask) mask++; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineAtopU (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_atop_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = dest + width; - while (dest < end) { - __m64 s, da, d, sia; - s = load8888(combine (src, mask)); - d = load8888(*dest); - sia = expand_alpha(s); - sia = negate(sia); - da = expand_alpha(d); - s = pix_add_mul (s, da, d, sia); - *dest = store8888(s); - ++dest; - ++src; + while (dest < end) + { + __m64 s, da, d, sia; + uint32_t ssrc = combine (src, mask); + + s = load8888 (&ssrc); + d = load8888 (dest); + sia = expand_alpha (s); + sia = negate (sia); + da = expand_alpha (d); + s = pix_add_mul (s, da, d, sia); + store8888 (dest, s); + + ++dest; + ++src; if (mask) mask++; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineAtopReverseU (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_atop_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end; end = dest + width; - while (dest < end) { - __m64 s, dia, d, sa; - s = load8888(combine(src, mask)); - d = load8888(*dest); - sa = expand_alpha(s); - dia = expand_alpha(d); - dia = negate(dia); + while (dest < end) + { + __m64 s, dia, d, sa; + uint32_t ssrc = combine (src, mask); + + s = load8888 (&ssrc); + d = load8888 (dest); + sa = expand_alpha (s); + dia = expand_alpha (d); + dia = negate (dia); s = pix_add_mul (s, dia, d, sa); - *dest = store8888(s); - ++dest; - ++src; + store8888 (dest, s); + + ++dest; + ++src; if (mask) mask++; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineXorU (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_xor_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = dest + width; - while (dest < end) { - __m64 s, dia, d, sia; - s = load8888(combine(src, mask)); - d = load8888(*dest); - sia = expand_alpha(s); - dia = expand_alpha(d); - sia = negate(sia); - dia = negate(dia); + while (dest < end) + { + __m64 s, dia, d, sia; + uint32_t ssrc = combine (src, mask); + + s = load8888 (&ssrc); + d = load8888 (dest); + sia = expand_alpha (s); + dia = expand_alpha (d); + sia = negate (sia); + dia = negate (dia); s = pix_add_mul (s, dia, d, sia); - *dest = store8888(s); - ++dest; - ++src; + store8888 (dest, s); + + ++dest; + ++src; if (mask) mask++; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineAddU (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_add_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = dest + width; - while (dest < end) { - __m64 s, d; - s = load8888(combine(src,mask)); - d = load8888(*dest); - s = pix_add(s, d); - *dest = store8888(s); - ++dest; - ++src; + + while (dest < end) + { + __m64 s, d; + uint32_t ssrc = combine (src, mask); + + s = load8888 (&ssrc); + d = load8888 (dest); + s = pix_add (s, d); + store8888 (dest, s); + + ++dest; + ++src; if (mask) mask++; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineSaturateU (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_saturate_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = dest + width; - while (dest < end) { - uint32_t s = combine(src,mask); - uint32_t d = *dest; - __m64 ms = load8888(s); - __m64 md = load8888(d); - uint32_t sa = s >> 24; - uint32_t da = ~d >> 24; - - if (sa > da) { - __m64 msa = load8888(FbIntDiv(da, sa) << 24); - msa = expand_alpha(msa); - ms = pix_multiply(ms, msa); - } - md = pix_add(md, ms); - *dest = store8888(md); - ++src; - ++dest; + + while (dest < end) + { + uint32_t s = combine (src, mask); + uint32_t d = *dest; + __m64 ms = load8888 (&s); + __m64 md = load8888 (&d); + uint32_t sa = s >> 24; + uint32_t da = ~d >> 24; + + if (sa > da) + { + uint32_t quot = DIV_UN8 (da, sa) << 24; + __m64 msa = load8888 ("); + msa = expand_alpha (msa); + ms = pix_multiply (ms, msa); + } + + md = pix_add (md, ms); + store8888 (dest, md); + + ++src; + ++dest; if (mask) mask++; } - _mm_empty(); + _mm_empty (); } - -static FASTCALL void -mmxCombineSrcC (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_src_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = src + width; - while (src < end) { - __m64 a = load8888(*mask); - __m64 s = load8888(*src); - s = pix_multiply(s, a); - *dest = store8888(s); - ++src; - ++mask; - ++dest; + + while (src < end) + { + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + + s = pix_multiply (s, a); + store8888 (dest, s); + + ++src; + ++mask; + ++dest; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineOverC (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_over_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = src + width; - while (src < end) { - __m64 a = load8888(*mask); - __m64 s = load8888(*src); - __m64 d = load8888(*dest); - __m64 sa = expand_alpha(s); - *dest = store8888(in_over (s, sa, a, d)); + while (src < end) + { + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); + __m64 sa = expand_alpha (s); + + store8888 (dest, in_over (s, sa, a, d)); - ++src; - ++dest; - ++mask; + ++src; + ++dest; + ++mask; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineOverReverseC (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_over_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = src + width; - while (src < end) { - __m64 a = load8888(*mask); - __m64 s = load8888(*src); - __m64 d = load8888(*dest); - __m64 da = expand_alpha(d); - *dest = store8888(over (d, da, in (s, a))); + while (src < end) + { + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); + __m64 da = expand_alpha (d); - ++src; - ++dest; - ++mask; + store8888 (dest, over (d, da, in (s, a))); + + ++src; + ++dest; + ++mask; } - _mm_empty(); + _mm_empty (); } - -static FASTCALL void -mmxCombineInC (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_in_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = src + width; - while (src < end) { - __m64 a = load8888(*mask); - __m64 s = load8888(*src); - __m64 d = load8888(*dest); - __m64 da = expand_alpha(d); - s = pix_multiply(s, a); - s = pix_multiply(s, da); - *dest = store8888(s); - ++src; - ++dest; - ++mask; + + while (src < end) + { + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); + __m64 da = expand_alpha (d); + + s = pix_multiply (s, a); + s = pix_multiply (s, da); + store8888 (dest, s); + + ++src; + ++dest; + ++mask; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineInReverseC (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_in_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = src + width; - while (src < end) { - __m64 a = load8888(*mask); - __m64 s = load8888(*src); - __m64 d = load8888(*dest); - __m64 sa = expand_alpha(s); - a = pix_multiply(a, sa); - d = pix_multiply(d, a); - *dest = store8888(d); - ++src; - ++dest; - ++mask; + + while (src < end) + { + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); + __m64 sa = expand_alpha (s); + + a = pix_multiply (a, sa); + d = pix_multiply (d, a); + store8888 (dest, d); + + ++src; + ++dest; + ++mask; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineOutC (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_out_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = src + width; - while (src < end) { - __m64 a = load8888(*mask); - __m64 s = load8888(*src); - __m64 d = load8888(*dest); - __m64 da = expand_alpha(d); - da = negate(da); - s = pix_multiply(s, a); - s = pix_multiply(s, da); - *dest = store8888(s); - ++src; - ++dest; - ++mask; + + while (src < end) + { + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); + __m64 da = expand_alpha (d); + + da = negate (da); + s = pix_multiply (s, a); + s = pix_multiply (s, da); + store8888 (dest, s); + + ++src; + ++dest; + ++mask; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineOutReverseC (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_out_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = src + width; - while (src < end) { - __m64 a = load8888(*mask); - __m64 s = load8888(*src); - __m64 d = load8888(*dest); - __m64 sa = expand_alpha(s); - a = pix_multiply(a, sa); - a = negate(a); - d = pix_multiply(d, a); - *dest = store8888(d); - ++src; - ++dest; - ++mask; + + while (src < end) + { + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); + __m64 sa = expand_alpha (s); + + a = pix_multiply (a, sa); + a = negate (a); + d = pix_multiply (d, a); + store8888 (dest, d); + + ++src; + ++dest; + ++mask; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineAtopC (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_atop_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = src + width; - while (src < end) { - __m64 a = load8888(*mask); - __m64 s = load8888(*src); - __m64 d = load8888(*dest); - __m64 da = expand_alpha(d); - __m64 sa = expand_alpha(s); - s = pix_multiply(s, a); - a = pix_multiply(a, sa); - a = negate(a); + + while (src < end) + { + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); + __m64 da = expand_alpha (d); + __m64 sa = expand_alpha (s); + + s = pix_multiply (s, a); + a = pix_multiply (a, sa); + a = negate (a); d = pix_add_mul (d, a, s, da); - *dest = store8888(d); - ++src; - ++dest; - ++mask; + store8888 (dest, d); + + ++src; + ++dest; + ++mask; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineAtopReverseC (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_atop_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = src + width; - while (src < end) { - __m64 a = load8888(*mask); - __m64 s = load8888(*src); - __m64 d = load8888(*dest); - __m64 da = expand_alpha(d); - __m64 sa = expand_alpha(s); - s = pix_multiply(s, a); - a = pix_multiply(a, sa); - da = negate(da); + + while (src < end) + { + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); + __m64 da = expand_alpha (d); + __m64 sa = expand_alpha (s); + + s = pix_multiply (s, a); + a = pix_multiply (a, sa); + da = negate (da); d = pix_add_mul (d, a, s, da); - *dest = store8888(d); - ++src; - ++dest; - ++mask; + store8888 (dest, d); + + ++src; + ++dest; + ++mask; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineXorC (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_xor_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = src + width; - while (src < end) { - __m64 a = load8888(*mask); - __m64 s = load8888(*src); - __m64 d = load8888(*dest); - __m64 da = expand_alpha(d); - __m64 sa = expand_alpha(s); - s = pix_multiply(s, a); - a = pix_multiply(a, sa); - da = negate(da); - a = negate(a); + + while (src < end) + { + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); + __m64 da = expand_alpha (d); + __m64 sa = expand_alpha (s); + + s = pix_multiply (s, a); + a = pix_multiply (a, sa); + da = negate (da); + a = negate (a); d = pix_add_mul (d, a, s, da); - *dest = store8888(d); - ++src; - ++dest; - ++mask; - } - _mm_empty(); -} + store8888 (dest, d); -static FASTCALL void -mmxCombineAddC (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) -{ - const uint32_t *end = src + width; - while (src < end) { - __m64 a = load8888(*mask); - __m64 s = load8888(*src); - __m64 d = load8888(*dest); - s = pix_multiply(s, a); - d = pix_add(s, d); - *dest = store8888(d); - ++src; - ++dest; - ++mask; + ++src; + ++dest; + ++mask; } - _mm_empty(); + _mm_empty (); } -void -fbComposeSetupMMX(void) +static void +mmx_combine_add_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { - static pixman_bool_t initialized = FALSE; + const uint32_t *end = src + width; - if (initialized) - return; - - /* check if we have MMX support and initialize accordingly */ - if (pixman_have_mmx()) + while (src < end) { - pixman_composeFunctions.combineU[PIXMAN_OP_OVER] = mmxCombineOverU; - pixman_composeFunctions.combineU[PIXMAN_OP_OVER_REVERSE] = mmxCombineOverReverseU; - pixman_composeFunctions.combineU[PIXMAN_OP_IN] = mmxCombineInU; - pixman_composeFunctions.combineU[PIXMAN_OP_IN_REVERSE] = mmxCombineInReverseU; - pixman_composeFunctions.combineU[PIXMAN_OP_OUT] = mmxCombineOutU; - pixman_composeFunctions.combineU[PIXMAN_OP_OUT_REVERSE] = mmxCombineOutReverseU; - pixman_composeFunctions.combineU[PIXMAN_OP_ATOP] = mmxCombineAtopU; - pixman_composeFunctions.combineU[PIXMAN_OP_ATOP_REVERSE] = mmxCombineAtopReverseU; -#if 0 - pixman_composeFunctions.combineU[PIXMAN_OP_XOR] = mmxCombineXorU; - pixman_composeFunctions.combineU[PIXMAN_OP_ADD] = mmxCombineAddU; - pixman_composeFunctions.combineU[PIXMAN_OP_SATURATE] = mmxCombineSaturateU; -#endif + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); - pixman_composeFunctions.combineC[PIXMAN_OP_SRC] = mmxCombineSrcC; - pixman_composeFunctions.combineC[PIXMAN_OP_OVER] = mmxCombineOverC; - pixman_composeFunctions.combineC[PIXMAN_OP_OVER_REVERSE] = mmxCombineOverReverseC; - pixman_composeFunctions.combineC[PIXMAN_OP_IN] = mmxCombineInC; - pixman_composeFunctions.combineC[PIXMAN_OP_IN_REVERSE] = mmxCombineInReverseC; - pixman_composeFunctions.combineC[PIXMAN_OP_OUT] = mmxCombineOutC; - pixman_composeFunctions.combineC[PIXMAN_OP_OUT_REVERSE] = mmxCombineOutReverseC; - pixman_composeFunctions.combineC[PIXMAN_OP_ATOP] = mmxCombineAtopC; - pixman_composeFunctions.combineC[PIXMAN_OP_ATOP_REVERSE] = mmxCombineAtopReverseC; - pixman_composeFunctions.combineC[PIXMAN_OP_XOR] = mmxCombineXorC; - pixman_composeFunctions.combineC[PIXMAN_OP_ADD] = mmxCombineAddC; - - pixman_composeFunctions.combineMaskU = mmxCombineMaskU; - } + s = pix_multiply (s, a); + d = pix_add (s, d); + store8888 (dest, d); - initialized = TRUE; + ++src; + ++dest; + ++mask; + } + _mm_empty (); } +/* ------------- MMX code paths called from fbpict.c -------------------- */ -/* ------------------ MMX code paths called from fbpict.c ----------------------- */ - -void -fbCompositeSolid_nx8888mmx (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) +static void +mmx_composite_over_n_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) { - uint32_t src; - uint32_t *dstLine, *dst; - uint16_t w; - int dstStride; - __m64 vsrc, vsrca; + PIXMAN_COMPOSITE_ARGS (info); + uint32_t src; + uint32_t *dst_line, *dst; + int32_t w; + int dst_stride; + __m64 vsrc, vsrca; - CHECKPOINT(); + CHECKPOINT (); - fbComposeGetSolid(pSrc, src, pDst->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); - if (src >> 24 == 0) + if (src == 0) return; - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); while (height--) { - dst = dstLine; - dstLine += dstStride; + dst = dst_line; + dst_line += dst_stride; w = width; - CHECKPOINT(); + CHECKPOINT (); while (w && (unsigned long)dst & 7) { - *dst = store8888(over(vsrc, vsrca, load8888(*dst))); + store8888 (dst, over (vsrc, vsrca, load8888 (dst))); w--; dst++; @@ -1022,75 +1211,64 @@ fbCompositeSolid_nx8888mmx (pixman_op_t op, vdest = *(__m64 *)dst; - dest0 = over(vsrc, vsrca, expand8888(vdest, 0)); - dest1 = over(vsrc, vsrca, expand8888(vdest, 1)); + dest0 = over (vsrc, vsrca, expand8888 (vdest, 0)); + dest1 = over (vsrc, vsrca, expand8888 (vdest, 1)); - *(__m64 *)dst = pack8888(dest0, dest1); + *(__m64 *)dst = pack8888 (dest0, dest1); dst += 2; w -= 2; } - CHECKPOINT(); + CHECKPOINT (); - while (w) + if (w) { - *dst = store8888(over(vsrc, vsrca, load8888(*dst))); - - w--; - dst++; + store8888 (dst, over (vsrc, vsrca, load8888 (dst))); } } - _mm_empty(); + _mm_empty (); } -void -fbCompositeSolid_nx0565mmx (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) +static void +mmx_composite_over_n_0565 (pixman_implementation_t *imp, + pixman_composite_info_t *info) { - uint32_t src; - uint16_t *dstLine, *dst; - uint16_t w; - int dstStride; - __m64 vsrc, vsrca; + PIXMAN_COMPOSITE_ARGS (info); + uint32_t src; + uint16_t *dst_line, *dst; + int32_t w; + int dst_stride; + __m64 vsrc, vsrca; - CHECKPOINT(); + CHECKPOINT (); - fbComposeGetSolid(pSrc, src, pDst->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); - if (src >> 24 == 0) + if (src == 0) return; - fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); while (height--) { - dst = dstLine; - dstLine += dstStride; + dst = dst_line; + dst_line += dst_stride; w = width; - CHECKPOINT(); + CHECKPOINT (); while (w && (unsigned long)dst & 7) { uint64_t d = *dst; - __m64 vdest = expand565 (M64(d), 0); - vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0); - *dst = UINT64(vdest); + __m64 vdest = expand565 (to_m64 (d), 0); + + vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0); + *dst = to_uint64 (vdest); w--; dst++; @@ -1102,10 +1280,10 @@ fbCompositeSolid_nx0565mmx (pixman_op_t op, vdest = *(__m64 *)dst; - vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 0)), vdest, 0); - vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 1)), vdest, 1); - vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 2)), vdest, 2); - vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 3)), vdest, 3); + vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 0)), vdest, 0); + vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 1)), vdest, 1); + vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 2)), vdest, 2); + vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 3)), vdest, 3); *(__m64 *)dst = vdest; @@ -1113,62 +1291,53 @@ fbCompositeSolid_nx0565mmx (pixman_op_t op, w -= 4; } - CHECKPOINT(); + CHECKPOINT (); while (w) { uint64_t d = *dst; - __m64 vdest = expand565 (M64(d), 0); - vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0); - *dst = UINT64(vdest); + __m64 vdest = expand565 (to_m64 (d), 0); + + vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0); + *dst = to_uint64 (vdest); w--; dst++; } } - _mm_empty(); + _mm_empty (); } -void -fbCompositeSolidMask_nx8888x8888Cmmx (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) +static void +mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, + pixman_composite_info_t *info) { - uint32_t src, srca; - uint32_t *dstLine; - uint32_t *maskLine; - int dstStride, maskStride; - __m64 vsrc, vsrca; + PIXMAN_COMPOSITE_ARGS (info); + uint32_t src; + uint32_t *dst_line; + uint32_t *mask_line; + int dst_stride, mask_stride; + __m64 vsrc, vsrca; - CHECKPOINT(); + CHECKPOINT (); - fbComposeGetSolid(pSrc, src, pDst->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); - srca = src >> 24; - if (srca == 0) + if (src == 0) return; - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); - vsrc = load8888(src); - vsrca = expand_alpha(vsrc); + vsrc = load8888 (&src); + vsrca = expand_alpha (vsrc); while (height--) { int twidth = width; - uint32_t *p = (uint32_t *)maskLine; - uint32_t *q = (uint32_t *)dstLine; + uint32_t *p = (uint32_t *)mask_line; + uint32_t *q = (uint32_t *)dst_line; while (twidth && (unsigned long)q & 7) { @@ -1176,9 +1345,9 @@ fbCompositeSolidMask_nx8888x8888Cmmx (pixman_op_t op, if (m) { - __m64 vdest = load8888(*q); - vdest = in_over(vsrc, vsrca, load8888(m), vdest); - *q = store8888(vdest); + __m64 vdest = load8888 (q); + vdest = in_over (vsrc, vsrca, load8888 (&m), vdest); + store8888 (q, vdest); } twidth--; @@ -1197,12 +1366,12 @@ fbCompositeSolidMask_nx8888x8888Cmmx (pixman_op_t op, __m64 dest0, dest1; __m64 vdest = *(__m64 *)q; - dest0 = in_over(vsrc, vsrca, load8888(m0), - expand8888 (vdest, 0)); - dest1 = in_over(vsrc, vsrca, load8888(m1), - expand8888 (vdest, 1)); + dest0 = in_over (vsrc, vsrca, load8888 (&m0), + expand8888 (vdest, 0)); + dest1 = in_over (vsrc, vsrca, load8888 (&m1), + expand8888 (vdest, 1)); - *(__m64 *)q = pack8888(dest0, dest1); + *(__m64 *)q = pack8888 (dest0, dest1); } p += 2; @@ -1210,15 +1379,15 @@ fbCompositeSolidMask_nx8888x8888Cmmx (pixman_op_t op, twidth -= 2; } - while (twidth) + if (twidth) { uint32_t m = *(uint32_t *)p; if (m) { - __m64 vdest = load8888(*q); - vdest = in_over(vsrc, vsrca, load8888(m), vdest); - *q = store8888(vdest); + __m64 vdest = load8888 (q); + vdest = in_over (vsrc, vsrca, load8888 (&m), vdest); + store8888 (q, vdest); } twidth--; @@ -1226,59 +1395,49 @@ fbCompositeSolidMask_nx8888x8888Cmmx (pixman_op_t op, q++; } - dstLine += dstStride; - maskLine += maskStride; + dst_line += dst_stride; + mask_line += mask_stride; } - _mm_empty(); -} - -void -fbCompositeSrc_8888x8x8888mmx (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - uint32_t *dstLine, *dst; - uint32_t *srcLine, *src; - uint32_t mask; - __m64 vmask; - int dstStride, srcStride; - uint16_t w; - __m64 srca; - - CHECKPOINT(); - - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); - - fbComposeGetSolid (pMask, mask, pDst->bits.format); + _mm_empty (); +} + +static void +mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + uint32_t mask; + __m64 vmask; + int dst_stride, src_stride; + int32_t w; + + CHECKPOINT (); + + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format); + mask &= 0xff000000; mask = mask | mask >> 8 | mask >> 16 | mask >> 24; - vmask = load8888 (mask); - srca = MC(4x00ff); + vmask = load8888 (&mask); while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; w = width; while (w && (unsigned long)dst & 7) { - __m64 s = load8888 (*src); - __m64 d = load8888 (*dst); + __m64 s = load8888 (src); + __m64 d = load8888 (dst); - *dst = store8888 (in_over (s, expand_alpha (s), vmask, d)); + store8888 (dst, in_over (s, expand_alpha (s), vmask, d)); w--; dst++; @@ -1287,82 +1446,71 @@ fbCompositeSrc_8888x8x8888mmx (pixman_op_t op, while (w >= 2) { - __m64 vs = *(__m64 *)src; + __m64 vs = ldq_u((uint64_t *)src); __m64 vd = *(__m64 *)dst; __m64 vsrc0 = expand8888 (vs, 0); __m64 vsrc1 = expand8888 (vs, 1); *(__m64 *)dst = pack8888 ( - in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)), - in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1))); + in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)), + in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1))); w -= 2; dst += 2; src += 2; } - while (w) + if (w) { - __m64 s = load8888 (*src); - __m64 d = load8888 (*dst); - - *dst = store8888 (in_over (s, expand_alpha (s), vmask, d)); + __m64 s = load8888 (src); + __m64 d = load8888 (dst); - w--; - dst++; - src++; + store8888 (dst, in_over (s, expand_alpha (s), vmask, d)); } } - _mm_empty(); -} - -void -fbCompositeSrc_x888xnx8888mmx (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - uint32_t *dstLine, *dst; - uint32_t *srcLine, *src; - uint32_t mask; - __m64 vmask; - int dstStride, srcStride; - uint16_t w; - __m64 srca; - - CHECKPOINT(); - - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); - fbComposeGetSolid (pMask, mask, pDst->bits.format); + _mm_empty (); +} + +static void +mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + uint32_t mask; + __m64 vmask; + int dst_stride, src_stride; + int32_t w; + __m64 srca; + + CHECKPOINT (); + + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format); + mask &= 0xff000000; mask = mask | mask >> 8 | mask >> 16 | mask >> 24; - vmask = load8888 (mask); - srca = MC(4x00ff); + vmask = load8888 (&mask); + srca = MC (4x00ff); while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; w = width; while (w && (unsigned long)dst & 7) { - __m64 s = load8888 (*src | 0xff000000); - __m64 d = load8888 (*dst); + uint32_t ssrc = *src | 0xff000000; + __m64 s = load8888 (&ssrc); + __m64 d = load8888 (dst); - *dst = store8888 (in_over (s, srca, vmask, d)); + store8888 (dst, in_over (s, srca, vmask, d)); w--; dst++; @@ -1380,46 +1528,46 @@ fbCompositeSrc_x888xnx8888mmx (pixman_op_t op, __m64 vd6 = *(__m64 *)(dst + 12); __m64 vd7 = *(__m64 *)(dst + 14); - __m64 vs0 = *(__m64 *)(src + 0); - __m64 vs1 = *(__m64 *)(src + 2); - __m64 vs2 = *(__m64 *)(src + 4); - __m64 vs3 = *(__m64 *)(src + 6); - __m64 vs4 = *(__m64 *)(src + 8); - __m64 vs5 = *(__m64 *)(src + 10); - __m64 vs6 = *(__m64 *)(src + 12); - __m64 vs7 = *(__m64 *)(src + 14); + __m64 vs0 = ldq_u((uint64_t *)(src + 0)); + __m64 vs1 = ldq_u((uint64_t *)(src + 2)); + __m64 vs2 = ldq_u((uint64_t *)(src + 4)); + __m64 vs3 = ldq_u((uint64_t *)(src + 6)); + __m64 vs4 = ldq_u((uint64_t *)(src + 8)); + __m64 vs5 = ldq_u((uint64_t *)(src + 10)); + __m64 vs6 = ldq_u((uint64_t *)(src + 12)); + __m64 vs7 = ldq_u((uint64_t *)(src + 14)); vd0 = pack8888 ( - in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)), - in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1))); + in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)), + in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1))); vd1 = pack8888 ( - in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)), - in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1))); + in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)), + in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1))); vd2 = pack8888 ( - in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)), - in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1))); + in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)), + in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1))); vd3 = pack8888 ( - in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)), - in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1))); + in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)), + in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1))); vd4 = pack8888 ( - in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)), - in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1))); + in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)), + in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1))); vd5 = pack8888 ( - in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)), - in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1))); + in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)), + in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1))); - vd6 = pack8888 ( - in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)), - in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1))); + vd6 = pack8888 ( + in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)), + in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1))); vd7 = pack8888 ( - in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)), - in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1))); + in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)), + in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1))); *(__m64 *)(dst + 0) = vd0; *(__m64 *)(dst + 2) = vd1; @@ -1437,10 +1585,11 @@ fbCompositeSrc_x888xnx8888mmx (pixman_op_t op, while (w) { - __m64 s = load8888 (*src | 0xff000000); - __m64 d = load8888 (*dst); + uint32_t ssrc = *src | 0xff000000; + __m64 s = load8888 (&ssrc); + __m64 d = load8888 (dst); - *dst = store8888 (in_over (s, srca, vmask, d)); + store8888 (dst, in_over (s, srca, vmask, d)); w--; dst++; @@ -1448,133 +1597,121 @@ fbCompositeSrc_x888xnx8888mmx (pixman_op_t op, } } - _mm_empty(); + _mm_empty (); } -void -fbCompositeSrc_8888x8888mmx (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) +static void +mmx_composite_over_8888_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) { - uint32_t *dstLine, *dst; - uint32_t *srcLine, *src; - uint32_t s; - int dstStride, srcStride; - uint8_t a; - uint16_t w; + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + uint32_t s; + int dst_stride, src_stride; + uint8_t a; + int32_t w; - CHECKPOINT(); + CHECKPOINT (); - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; w = width; while (w--) { s = *src++; a = s >> 24; + if (a == 0xff) + { *dst = s; - else if (a) { + } + else if (s) + { __m64 ms, sa; - ms = load8888(s); - sa = expand_alpha(ms); - *dst = store8888(over(ms, sa, load8888(*dst))); + ms = load8888 (&s); + sa = expand_alpha (ms); + store8888 (dst, over (ms, sa, load8888 (dst))); } + dst++; } } - _mm_empty(); + _mm_empty (); } -void -fbCompositeSrc_8888x0565mmx (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) +static void +mmx_composite_over_8888_0565 (pixman_implementation_t *imp, + pixman_composite_info_t *info) { - uint16_t *dstLine, *dst; - uint32_t *srcLine, *src; - int dstStride, srcStride; - uint16_t w; + PIXMAN_COMPOSITE_ARGS (info); + uint16_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; - CHECKPOINT(); + CHECKPOINT (); - fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); #if 0 /* FIXME */ - assert (pSrc->pDrawable == pMask->pDrawable); + assert (src_image->drawable == mask_image->drawable); #endif while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; w = width; - CHECKPOINT(); + CHECKPOINT (); while (w && (unsigned long)dst & 7) { - __m64 vsrc = load8888 (*src); + __m64 vsrc = load8888 (src); uint64_t d = *dst; - __m64 vdest = expand565 (M64(d), 0); + __m64 vdest = expand565 (to_m64 (d), 0); - vdest = pack565(over(vsrc, expand_alpha(vsrc), vdest), vdest, 0); + vdest = pack_565 ( + over (vsrc, expand_alpha (vsrc), vdest), vdest, 0); - *dst = UINT64(vdest); + *dst = to_uint64 (vdest); w--; dst++; src++; } - CHECKPOINT(); + CHECKPOINT (); while (w >= 4) { __m64 vsrc0, vsrc1, vsrc2, vsrc3; __m64 vdest; - vsrc0 = load8888(*(src + 0)); - vsrc1 = load8888(*(src + 1)); - vsrc2 = load8888(*(src + 2)); - vsrc3 = load8888(*(src + 3)); + vsrc0 = load8888 ((src + 0)); + vsrc1 = load8888 ((src + 1)); + vsrc2 = load8888 ((src + 2)); + vsrc3 = load8888 ((src + 3)); vdest = *(__m64 *)dst; - vdest = pack565(over(vsrc0, expand_alpha(vsrc0), expand565(vdest, 0)), vdest, 0); - vdest = pack565(over(vsrc1, expand_alpha(vsrc1), expand565(vdest, 1)), vdest, 1); - vdest = pack565(over(vsrc2, expand_alpha(vsrc2), expand565(vdest, 2)), vdest, 2); - vdest = pack565(over(vsrc3, expand_alpha(vsrc3), expand565(vdest, 3)), vdest, 3); + vdest = pack_565 (over (vsrc0, expand_alpha (vsrc0), expand565 (vdest, 0)), vdest, 0); + vdest = pack_565 (over (vsrc1, expand_alpha (vsrc1), expand565 (vdest, 1)), vdest, 1); + vdest = pack_565 (over (vsrc2, expand_alpha (vsrc2), expand565 (vdest, 2)), vdest, 2); + vdest = pack_565 (over (vsrc3, expand_alpha (vsrc3), expand565 (vdest, 3)), vdest, 3); *(__m64 *)dst = vdest; @@ -1583,17 +1720,17 @@ fbCompositeSrc_8888x0565mmx (pixman_op_t op, src += 4; } - CHECKPOINT(); + CHECKPOINT (); while (w) { - __m64 vsrc = load8888 (*src); + __m64 vsrc = load8888 (src); uint64_t d = *dst; - __m64 vdest = expand565 (M64(d), 0); + __m64 vdest = expand565 (to_m64 (d), 0); - vdest = pack565(over(vsrc, expand_alpha(vsrc), vdest), vdest, 0); + vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0); - *dst = UINT64(vdest); + *dst = to_uint64 (vdest); w--; dst++; @@ -1601,56 +1738,47 @@ fbCompositeSrc_8888x0565mmx (pixman_op_t op, } } - _mm_empty(); + _mm_empty (); } -void -fbCompositeSolidMask_nx8x8888mmx (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) +static void +mmx_composite_over_n_8_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) { - uint32_t src, srca; - uint32_t *dstLine, *dst; - uint8_t *maskLine, *mask; - int dstStride, maskStride; - uint16_t w; - __m64 vsrc, vsrca; - uint64_t srcsrc; + PIXMAN_COMPOSITE_ARGS (info); + uint32_t src, srca; + uint32_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + int32_t w; + __m64 vsrc, vsrca; + uint64_t srcsrc; - CHECKPOINT(); + CHECKPOINT (); - fbComposeGetSolid(pSrc, src, pDst->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); srca = src >> 24; - if (srca == 0) + if (src == 0) return; srcsrc = (uint64_t)src << 32 | src; - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); while (height--) { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; w = width; - CHECKPOINT(); + CHECKPOINT (); while (w && (unsigned long)dst & 7) { @@ -1658,8 +1786,11 @@ fbCompositeSolidMask_nx8x8888mmx (pixman_op_t op, if (m) { - __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev (M64(m)), load8888(*dst)); - *dst = store8888(vdest); + __m64 vdest = in_over (vsrc, vsrca, + expand_alpha_rev (to_m64 (m)), + load8888 (dst)); + + store8888 (dst, vdest); } w--; @@ -1667,11 +1798,12 @@ fbCompositeSolidMask_nx8x8888mmx (pixman_op_t op, dst++; } - CHECKPOINT(); + CHECKPOINT (); while (w >= 2) { uint64_t m0, m1; + m0 = *mask; m1 = *(mask + 1); @@ -1686,10 +1818,12 @@ fbCompositeSolidMask_nx8x8888mmx (pixman_op_t op, vdest = *(__m64 *)dst; - dest0 = in_over(vsrc, vsrca, expand_alpha_rev (M64(m0)), expand8888(vdest, 0)); - dest1 = in_over(vsrc, vsrca, expand_alpha_rev (M64(m1)), expand8888(vdest, 1)); + dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)), + expand8888 (vdest, 0)); + dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)), + expand8888 (vdest, 1)); - *(__m64 *)dst = pack8888(dest0, dest1); + *(__m64 *)dst = pack8888 (dest0, dest1); } mask += 2; @@ -1697,65 +1831,55 @@ fbCompositeSolidMask_nx8x8888mmx (pixman_op_t op, w -= 2; } - CHECKPOINT(); + CHECKPOINT (); - while (w) + if (w) { uint64_t m = *mask; if (m) { - __m64 vdest = load8888(*dst); - vdest = in_over(vsrc, vsrca, expand_alpha_rev (M64(m)), vdest); - *dst = store8888(vdest); - } + __m64 vdest = load8888 (dst); - w--; - mask++; - dst++; + vdest = in_over ( + vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest); + store8888 (dst, vdest); + } } } - _mm_empty(); + _mm_empty (); } pixman_bool_t pixman_fill_mmx (uint32_t *bits, - int stride, - int bpp, - int x, - int y, - int width, - int height, - uint32_t xor) -{ - uint64_t fill; - __m64 vfill; - uint32_t byte_width; - uint8_t *byte_line; -#ifdef __GNUC__ - __m64 v1, v2, v3, v4, v5, v6, v7; + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + uint64_t fill; + __m64 vfill; + uint32_t byte_width; + uint8_t *byte_line; + +#if defined __GNUC__ && defined USE_X86_MMX + __m64 v1, v2, v3, v4, v5, v6, v7; #endif if (bpp != 16 && bpp != 32 && bpp != 8) return FALSE; - if (bpp == 16 && (xor >> 16 != (xor & 0xffff))) - return FALSE; - - if (bpp == 8 && - ((xor >> 16 != (xor & 0xffff)) || - (xor >> 24 != (xor & 0x00ff) >> 16))) - { - return FALSE; - } - if (bpp == 8) { stride = stride * (int) sizeof (uint32_t) / 1; byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x); byte_width = width; stride *= 1; + xor = (xor & 0xff) * 0x01010101; } else if (bpp == 16) { @@ -1763,6 +1887,7 @@ pixman_fill_mmx (uint32_t *bits, byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); byte_width = 2 * width; stride *= 2; + xor = (xor & 0xffff) * 0x00010001; } else { @@ -1773,19 +1898,19 @@ pixman_fill_mmx (uint32_t *bits, } fill = ((uint64_t)xor << 32) | xor; - vfill = M64(fill); + vfill = to_m64 (fill); -#ifdef __GNUC__ +#if defined __GNUC__ && defined USE_X86_MMX __asm__ ( - "movq %7, %0\n" - "movq %7, %1\n" - "movq %7, %2\n" - "movq %7, %3\n" - "movq %7, %4\n" - "movq %7, %5\n" - "movq %7, %6\n" - : "=y" (v1), "=y" (v2), "=y" (v3), - "=y" (v4), "=y" (v5), "=y" (v6), "=y" (v7) + "movq %7, %0\n" + "movq %7, %1\n" + "movq %7, %2\n" + "movq %7, %3\n" + "movq %7, %4\n" + "movq %7, %5\n" + "movq %7, %6\n" + : "=&y" (v1), "=&y" (v2), "=&y" (v3), + "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7) : "y" (vfill)); #endif @@ -1793,17 +1918,18 @@ pixman_fill_mmx (uint32_t *bits, { int w; uint8_t *d = byte_line; + byte_line += stride; w = byte_width; - while (w >= 1 && ((unsigned long)d & 1)) + if (w >= 1 && ((unsigned long)d & 1)) { *(uint8_t *)d = (xor & 0xff); w--; d++; } - - while (w >= 2 && ((unsigned long)d & 3)) + + if (w >= 2 && ((unsigned long)d & 3)) { *(uint16_t *)d = xor; w -= 2; @@ -1820,16 +1946,16 @@ pixman_fill_mmx (uint32_t *bits, while (w >= 64) { -#ifdef __GNUC__ +#if defined __GNUC__ && defined USE_X86_MMX __asm__ ( - "movq %1, (%0)\n" - "movq %2, 8(%0)\n" - "movq %3, 16(%0)\n" - "movq %4, 24(%0)\n" - "movq %5, 32(%0)\n" - "movq %6, 40(%0)\n" - "movq %7, 48(%0)\n" - "movq %8, 56(%0)\n" + "movq %1, (%0)\n" + "movq %2, 8(%0)\n" + "movq %3, 16(%0)\n" + "movq %4, 24(%0)\n" + "movq %5, 32(%0)\n" + "movq %6, 40(%0)\n" + "movq %7, 48(%0)\n" + "movq %8, 56(%0)\n" : : "r" (d), "y" (vfill), "y" (v1), "y" (v2), "y" (v3), @@ -1856,76 +1982,67 @@ pixman_fill_mmx (uint32_t *bits, w -= 4; d += 4; } - while (w >= 2) + if (w >= 2) { *(uint16_t *)d = xor; w -= 2; d += 2; } - while (w >= 1) + if (w >= 1) { *(uint8_t *)d = (xor & 0xff); w--; d++; } - + } - _mm_empty(); + _mm_empty (); return TRUE; } -void -fbCompositeSolidMaskSrc_nx8x8888mmx (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - uint32_t src, srca; - uint32_t *dstLine, *dst; - uint8_t *maskLine, *mask; - int dstStride, maskStride; - uint16_t w; - __m64 vsrc, vsrca; - uint64_t srcsrc; - - CHECKPOINT(); - - fbComposeGetSolid(pSrc, src, pDst->bits.format); +static void +mmx_composite_src_n_8_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t src, srca; + uint32_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + int32_t w; + __m64 vsrc; + uint64_t srcsrc; + + CHECKPOINT (); + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); srca = src >> 24; - if (srca == 0) + if (src == 0) { - pixman_fill_mmx (pDst->bits.bits, pDst->bits.rowstride, PIXMAN_FORMAT_BPP (pDst->bits.format), - xDst, yDst, width, height, 0); + pixman_fill_mmx (dest_image->bits.bits, dest_image->bits.rowstride, + PIXMAN_FORMAT_BPP (dest_image->bits.format), + dest_x, dest_y, width, height, 0); return; } srcsrc = (uint64_t)src << 32 | src; - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - vsrc = load8888 (src); - vsrca = expand_alpha (vsrc); + vsrc = load8888 (&src); while (height--) { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; w = width; - CHECKPOINT(); + CHECKPOINT (); while (w && (unsigned long)dst & 7) { @@ -1933,8 +2050,9 @@ fbCompositeSolidMaskSrc_nx8x8888mmx (pixman_op_t op, if (m) { - __m64 vdest = in(vsrc, expand_alpha_rev (M64(m))); - *dst = store8888(vdest); + __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m))); + + store8888 (dst, vdest); } else { @@ -1946,7 +2064,7 @@ fbCompositeSolidMaskSrc_nx8x8888mmx (pixman_op_t op, dst++; } - CHECKPOINT(); + CHECKPOINT (); while (w >= 2) { @@ -1960,15 +2078,12 @@ fbCompositeSolidMaskSrc_nx8x8888mmx (pixman_op_t op, } else if (m0 | m1) { - __m64 vdest; __m64 dest0, dest1; - vdest = *(__m64 *)dst; - - dest0 = in(vsrc, expand_alpha_rev (M64(m0))); - dest1 = in(vsrc, expand_alpha_rev (M64(m1))); + dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0))); + dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1))); - *(__m64 *)dst = pack8888(dest0, dest1); + *(__m64 *)dst = pack8888 (dest0, dest1); } else { @@ -1980,83 +2095,72 @@ fbCompositeSolidMaskSrc_nx8x8888mmx (pixman_op_t op, w -= 2; } - CHECKPOINT(); + CHECKPOINT (); - while (w) + if (w) { uint64_t m = *mask; if (m) { - __m64 vdest = load8888(*dst); - vdest = in(vsrc, expand_alpha_rev (M64(m))); - *dst = store8888(vdest); + __m64 vdest = load8888 (dst); + + vdest = in (vsrc, expand_alpha_rev (to_m64 (m))); + store8888 (dst, vdest); } else { *dst = 0; } - - w--; - mask++; - dst++; } } - _mm_empty(); -} - -void -fbCompositeSolidMask_nx8x0565mmx (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - uint32_t src, srca; - uint16_t *dstLine, *dst; - uint8_t *maskLine, *mask; - int dstStride, maskStride; - uint16_t w; - __m64 vsrc, vsrca, tmp; + _mm_empty (); +} + +static void +mmx_composite_over_n_8_0565 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t src, srca; + uint16_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + int32_t w; + __m64 vsrc, vsrca, tmp; uint64_t srcsrcsrcsrc, src16; - CHECKPOINT(); + CHECKPOINT (); - fbComposeGetSolid(pSrc, src, pDst->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); srca = src >> 24; - if (srca == 0) + if (src == 0) return; - fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); - tmp = pack565(vsrc, _mm_setzero_si64(), 0); - src16 = UINT64(tmp); + tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0); + src16 = to_uint64 (tmp); - srcsrcsrcsrc = (uint64_t)src16 << 48 | (uint64_t)src16 << 32 | + srcsrcsrcsrc = + (uint64_t)src16 << 48 | (uint64_t)src16 << 32 | (uint64_t)src16 << 16 | (uint64_t)src16; while (height--) { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; w = width; - CHECKPOINT(); + CHECKPOINT (); while (w && (unsigned long)dst & 7) { @@ -2065,10 +2169,12 @@ fbCompositeSolidMask_nx8x0565mmx (pixman_op_t op, if (m) { uint64_t d = *dst; - __m64 vd = M64(d); - __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev (M64 (m)), expand565(vd, 0)); - vd = pack565(vdest, _mm_setzero_si64(), 0); - *dst = UINT64(vd); + __m64 vd = to_m64 (d); + __m64 vdest = in_over ( + vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0)); + + vd = pack_565 (vdest, _mm_setzero_si64 (), 0); + *dst = to_uint64 (vd); } w--; @@ -2076,7 +2182,7 @@ fbCompositeSolidMask_nx8x0565mmx (pixman_op_t op, dst++; } - CHECKPOINT(); + CHECKPOINT (); while (w >= 4) { @@ -2097,14 +2203,18 @@ fbCompositeSolidMask_nx8x0565mmx (pixman_op_t op, vdest = *(__m64 *)dst; - vm0 = M64(m0); - vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm0), expand565(vdest, 0)), vdest, 0); - vm1 = M64(m1); - vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm1), expand565(vdest, 1)), vdest, 1); - vm2 = M64(m2); - vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm2), expand565(vdest, 2)), vdest, 2); - vm3 = M64(m3); - vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm3), expand565(vdest, 3)), vdest, 3); + vm0 = to_m64 (m0); + vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm0), + expand565 (vdest, 0)), vdest, 0); + vm1 = to_m64 (m1); + vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm1), + expand565 (vdest, 1)), vdest, 1); + vm2 = to_m64 (m2); + vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm2), + expand565 (vdest, 2)), vdest, 2); + vm3 = to_m64 (m3); + vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm3), + expand565 (vdest, 3)), vdest, 3); *(__m64 *)dst = vdest; } @@ -2114,7 +2224,7 @@ fbCompositeSolidMask_nx8x0565mmx (pixman_op_t op, dst += 4; } - CHECKPOINT(); + CHECKPOINT (); while (w) { @@ -2123,10 +2233,11 @@ fbCompositeSolidMask_nx8x0565mmx (pixman_op_t op, if (m) { uint64_t d = *dst; - __m64 vd = M64(d); - __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev (M64(m)), expand565(vd, 0)); - vd = pack565(vdest, _mm_setzero_si64(), 0); - *dst = UINT64(vd); + __m64 vd = to_m64 (d); + __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)), + expand565 (vd, 0)); + vd = pack_565 (vdest, _mm_setzero_si64 (), 0); + *dst = to_uint64 (vd); } w--; @@ -2135,64 +2246,55 @@ fbCompositeSolidMask_nx8x0565mmx (pixman_op_t op, } } - _mm_empty(); + _mm_empty (); } -void -fbCompositeSrc_8888RevNPx0565mmx (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) +static void +mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp, + pixman_composite_info_t *info) { - uint16_t *dstLine, *dst; - uint32_t *srcLine, *src; - int dstStride, srcStride; - uint16_t w; + PIXMAN_COMPOSITE_ARGS (info); + uint16_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; - CHECKPOINT(); + CHECKPOINT (); - fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); #if 0 /* FIXME */ - assert (pSrc->pDrawable == pMask->pDrawable); + assert (src_image->drawable == mask_image->drawable); #endif while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; w = width; - CHECKPOINT(); + CHECKPOINT (); while (w && (unsigned long)dst & 7) { - __m64 vsrc = load8888 (*src); + __m64 vsrc = load8888 (src); uint64_t d = *dst; - __m64 vdest = expand565 (M64(d), 0); + __m64 vdest = expand565 (to_m64 (d), 0); - vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0); + vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0); - *dst = UINT64(vdest); + *dst = to_uint64 (vdest); w--; dst++; src++; } - CHECKPOINT(); + CHECKPOINT (); while (w >= 4) { @@ -2212,21 +2314,21 @@ fbCompositeSrc_8888RevNPx0565mmx (pixman_op_t op, if ((a0 & a1 & a2 & a3) == 0xFF) { __m64 vdest; - vdest = pack565(invert_colors(load8888(s0)), _mm_setzero_si64(), 0); - vdest = pack565(invert_colors(load8888(s1)), vdest, 1); - vdest = pack565(invert_colors(load8888(s2)), vdest, 2); - vdest = pack565(invert_colors(load8888(s3)), vdest, 3); + vdest = pack_565 (invert_colors (load8888 (&s0)), _mm_setzero_si64 (), 0); + vdest = pack_565 (invert_colors (load8888 (&s1)), vdest, 1); + vdest = pack_565 (invert_colors (load8888 (&s2)), vdest, 2); + vdest = pack_565 (invert_colors (load8888 (&s3)), vdest, 3); *(__m64 *)dst = vdest; } - else if (a0 | a1 | a2 | a3) + else if (s0 | s1 | s2 | s3) { __m64 vdest = *(__m64 *)dst; - vdest = pack565(over_rev_non_pre(load8888(s0), expand565(vdest, 0)), vdest, 0); - vdest = pack565(over_rev_non_pre(load8888(s1), expand565(vdest, 1)), vdest, 1); - vdest = pack565(over_rev_non_pre(load8888(s2), expand565(vdest, 2)), vdest, 2); - vdest = pack565(over_rev_non_pre(load8888(s3), expand565(vdest, 3)), vdest, 3); + vdest = pack_565 (over_rev_non_pre (load8888 (&s0), expand565 (vdest, 0)), vdest, 0); + vdest = pack_565 (over_rev_non_pre (load8888 (&s1), expand565 (vdest, 1)), vdest, 1); + vdest = pack_565 (over_rev_non_pre (load8888 (&s2), expand565 (vdest, 2)), vdest, 2); + vdest = pack_565 (over_rev_non_pre (load8888 (&s3), expand565 (vdest, 3)), vdest, 3); *(__m64 *)dst = vdest; } @@ -2236,17 +2338,17 @@ fbCompositeSrc_8888RevNPx0565mmx (pixman_op_t op, src += 4; } - CHECKPOINT(); + CHECKPOINT (); while (w) { - __m64 vsrc = load8888 (*src); + __m64 vsrc = load8888 (src); uint64_t d = *dst; - __m64 vdest = expand565 (M64(d), 0); + __m64 vdest = expand565 (to_m64 (d), 0); - vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0); + vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0); - *dst = UINT64(vdest); + *dst = to_uint64 (vdest); w--; dst++; @@ -2254,54 +2356,43 @@ fbCompositeSrc_8888RevNPx0565mmx (pixman_op_t op, } } - _mm_empty(); + _mm_empty (); } -/* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */ - -void -fbCompositeSrc_8888RevNPx8888mmx (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) +static void +mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) { - uint32_t *dstLine, *dst; - uint32_t *srcLine, *src; - int dstStride, srcStride; - uint16_t w; + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; - CHECKPOINT(); + CHECKPOINT (); - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); #if 0 /* FIXME */ - assert (pSrc->pDrawable == pMask->pDrawable); + assert (src_image->drawable == mask_image->drawable); #endif while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; w = width; while (w && (unsigned long)dst & 7) { - __m64 s = load8888 (*src); - __m64 d = load8888 (*dst); + __m64 s = load8888 (src); + __m64 d = load8888 (dst); - *dst = store8888 (over_rev_non_pre (s, d)); + store8888 (dst, over_rev_non_pre (s, d)); w--; dst++; @@ -2310,7 +2401,7 @@ fbCompositeSrc_8888RevNPx8888mmx (pixman_op_t op, while (w >= 2) { - uint64_t s0, s1; + uint32_t s0, s1; unsigned char a0, a1; __m64 d0, d1; @@ -2322,17 +2413,17 @@ fbCompositeSrc_8888RevNPx8888mmx (pixman_op_t op, if ((a0 & a1) == 0xFF) { - d0 = invert_colors(load8888(s0)); - d1 = invert_colors(load8888(s1)); + d0 = invert_colors (load8888 (&s0)); + d1 = invert_colors (load8888 (&s1)); *(__m64 *)dst = pack8888 (d0, d1); } - else if (a0 | a1) + else if (s0 | s1) { __m64 vdest = *(__m64 *)dst; - d0 = over_rev_non_pre (load8888(s0), expand8888 (vdest, 0)); - d1 = over_rev_non_pre (load8888(s1), expand8888 (vdest, 1)); + d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0)); + d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1)); *(__m64 *)dst = pack8888 (d0, d1); } @@ -2342,61 +2433,47 @@ fbCompositeSrc_8888RevNPx8888mmx (pixman_op_t op, src += 2; } - while (w) + if (w) { - __m64 s = load8888 (*src); - __m64 d = load8888 (*dst); + __m64 s = load8888 (src); + __m64 d = load8888 (dst); - *dst = store8888 (over_rev_non_pre (s, d)); - - w--; - dst++; - src++; + store8888 (dst, over_rev_non_pre (s, d)); } } - _mm_empty(); + _mm_empty (); } -void -fbCompositeSolidMask_nx8888x0565Cmmx (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) +static void +mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, + pixman_composite_info_t *info) { - uint32_t src, srca; - uint16_t *dstLine; - uint32_t *maskLine; - int dstStride, maskStride; - __m64 vsrc, vsrca; + PIXMAN_COMPOSITE_ARGS (info); + uint32_t src; + uint16_t *dst_line; + uint32_t *mask_line; + int dst_stride, mask_stride; + __m64 vsrc, vsrca; - CHECKPOINT(); + CHECKPOINT (); - fbComposeGetSolid(pSrc, src, pDst->bits.format); + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); - srca = src >> 24; - if (srca == 0) + if (src == 0) return; - fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); while (height--) { int twidth = width; - uint32_t *p = (uint32_t *)maskLine; - uint16_t *q = (uint16_t *)dstLine; + uint32_t *p = (uint32_t *)mask_line; + uint16_t *q = (uint16_t *)dst_line; while (twidth && ((unsigned long)q & 7)) { @@ -2405,9 +2482,9 @@ fbCompositeSolidMask_nx8888x0565Cmmx (pixman_op_t op, if (m) { uint64_t d = *q; - __m64 vdest = expand565 (M64(d), 0); - vdest = pack565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0); - *q = UINT64(vdest); + __m64 vdest = expand565 (to_m64 (d), 0); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0); + *q = to_uint64 (vdest); } twidth--; @@ -2428,10 +2505,10 @@ fbCompositeSolidMask_nx8888x0565Cmmx (pixman_op_t op, { __m64 vdest = *(__m64 *)q; - vdest = pack565(in_over(vsrc, vsrca, load8888(m0), expand565(vdest, 0)), vdest, 0); - vdest = pack565(in_over(vsrc, vsrca, load8888(m1), expand565(vdest, 1)), vdest, 1); - vdest = pack565(in_over(vsrc, vsrca, load8888(m2), expand565(vdest, 2)), vdest, 2); - vdest = pack565(in_over(vsrc, vsrca, load8888(m3), expand565(vdest, 3)), vdest, 3); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m0), expand565 (vdest, 0)), vdest, 0); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m1), expand565 (vdest, 1)), vdest, 1); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m2), expand565 (vdest, 2)), vdest, 2); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m3), expand565 (vdest, 3)), vdest, 3); *(__m64 *)q = vdest; } @@ -2448,9 +2525,9 @@ fbCompositeSolidMask_nx8888x0565Cmmx (pixman_op_t op, if (m) { uint64_t d = *q; - __m64 vdest = expand565(M64(d), 0); - vdest = pack565 (in_over(vsrc, vsrca, load8888(m), vdest), vdest, 0); - *q = UINT64(vdest); + __m64 vdest = expand565 (to_m64 (d), 0); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0); + *q = to_uint64 (vdest); } twidth--; @@ -2458,141 +2535,140 @@ fbCompositeSolidMask_nx8888x0565Cmmx (pixman_op_t op, q++; } - maskLine += maskStride; - dstLine += dstStride; + mask_line += mask_stride; + dst_line += dst_stride; } _mm_empty (); } -void -fbCompositeIn_nx8x8mmx (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - uint8_t *dstLine, *dst; - uint8_t *maskLine, *mask; - int dstStride, maskStride; - uint16_t w; - uint32_t src; - uint8_t sa; - __m64 vsrc, vsrca; - - fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); - - fbComposeGetSolid(pSrc, src, pDst->bits.format); +static void +mmx_composite_in_n_8_8 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint8_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + int32_t w; + uint32_t src; + uint8_t sa; + __m64 vsrc, vsrca; + + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); sa = src >> 24; - if (sa == 0) - return; - vsrc = load8888(src); - vsrca = expand_alpha(vsrc); + vsrc = load8888 (&src); + vsrca = expand_alpha (vsrc); while (height--) { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; w = width; - if ((((unsigned long)pDst & 3) == 0) && - (((unsigned long)pSrc & 3) == 0)) + while (w && (unsigned long)dst & 7) { - while (w >= 4) - { - uint32_t m; - __m64 vmask; - __m64 vdest; + uint16_t tmp; + uint8_t a; + uint32_t m, d; - m = 0; + a = *mask++; + d = *dst; - vmask = load8888 (*(uint32_t *)mask); - vdest = load8888 (*(uint32_t *)dst); + m = MUL_UN8 (sa, a, tmp); + d = MUL_UN8 (m, d, tmp); - *(uint32_t *)dst = store8888 (in (in (vsrca, vmask), vdest)); + *dst++ = d; + w--; + } - dst += 4; - mask += 4; - w -= 4; - } + while (w >= 4) + { + __m64 vmask; + __m64 vdest; + + vmask = load8888u ((uint32_t *)mask); + vdest = load8888 ((uint32_t *)dst); + + store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest)); + + dst += 4; + mask += 4; + w -= 4; } while (w--) { - uint16_t tmp; - uint8_t a; - uint32_t m, d; - uint32_t r; + uint16_t tmp; + uint8_t a; + uint32_t m, d; a = *mask++; d = *dst; - m = FbInU (sa, 0, a, tmp); - r = FbInU (m, 0, d, tmp); + m = MUL_UN8 (sa, a, tmp); + d = MUL_UN8 (m, d, tmp); - *dst++ = r; + *dst++ = d; } } - _mm_empty(); + _mm_empty (); } -void -fbCompositeIn_8x8mmx (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) +static void +mmx_composite_in_8_8 (pixman_implementation_t *imp, + pixman_composite_info_t *info) { - uint8_t *dstLine, *dst; - uint8_t *srcLine, *src; - int srcStride, dstStride; - uint16_t w; + PIXMAN_COMPOSITE_ARGS (info); + uint8_t *dst_line, *dst; + uint8_t *src_line, *src; + int src_stride, dst_stride; + int32_t w; - fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; w = width; - if ((((unsigned long)pDst & 3) == 0) && - (((unsigned long)pSrc & 3) == 0)) + while (w && (unsigned long)dst & 3) { - while (w >= 4) - { - uint32_t *s = (uint32_t *)src; - uint32_t *d = (uint32_t *)dst; + uint8_t s, d; + uint16_t tmp; - *d = store8888 (in (load8888 (*s), load8888 (*d))); + s = *src; + d = *dst; - w -= 4; - dst += 4; - src += 4; - } + *dst = MUL_UN8 (s, d, tmp); + + src++; + dst++; + w--; + } + + while (w >= 4) + { + uint32_t *s = (uint32_t *)src; + uint32_t *d = (uint32_t *)dst; + + store8888 (d, in (load8888u (s), load8888 (d))); + + w -= 4; + dst += 4; + src += 4; } while (w--) @@ -2603,7 +2679,7 @@ fbCompositeIn_8x8mmx (pixman_op_t op, s = *src; d = *dst; - *dst = FbInU (s, 0, d, tmp); + *dst = MUL_UN8 (s, d, tmp); src++; dst++; @@ -2613,116 +2689,115 @@ fbCompositeIn_8x8mmx (pixman_op_t op, _mm_empty (); } -void -fbCompositeSrcAdd_8888x8x8mmx (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - uint8_t *dstLine, *dst; - uint8_t *maskLine, *mask; - int dstStride, maskStride; - uint16_t w; - uint32_t src; - uint8_t sa; - __m64 vsrc, vsrca; - - fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); - - fbComposeGetSolid(pSrc, src, pDst->bits.format); +static void +mmx_composite_add_n_8_8 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint8_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + int32_t w; + uint32_t src; + uint8_t sa; + __m64 vsrc, vsrca; + + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); sa = src >> 24; - if (sa == 0) + + if (src == 0) return; - vsrc = load8888(src); - vsrca = expand_alpha(vsrc); + vsrc = load8888 (&src); + vsrca = expand_alpha (vsrc); while (height--) { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; w = width; - if ((((unsigned long)pMask & 3) == 0) && - (((unsigned long)pDst & 3) == 0)) + while (w && (unsigned long)dst & 3) { - while (w >= 4) - { - __m64 vmask = load8888 (*(uint32_t *)mask); - __m64 vdest = load8888 (*(uint32_t *)dst); + uint16_t tmp; + uint16_t a; + uint32_t m, d; + uint32_t r; - *(uint32_t *)dst = store8888 (_mm_adds_pu8 (in (vsrca, vmask), vdest)); + a = *mask++; + d = *dst; - w -= 4; - dst += 4; - mask += 4; - } + m = MUL_UN8 (sa, a, tmp); + r = ADD_UN8 (m, d, tmp); + + *dst++ = r; + w--; + } + + while (w >= 4) + { + __m64 vmask; + __m64 vdest; + + vmask = load8888u ((uint32_t *)mask); + vdest = load8888 ((uint32_t *)dst); + + store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest)); + + dst += 4; + mask += 4; + w -= 4; } while (w--) { - uint16_t tmp; - uint16_t a; - uint32_t m, d; - uint32_t r; + uint16_t tmp; + uint16_t a; + uint32_t m, d; + uint32_t r; a = *mask++; d = *dst; - m = FbInU (sa, 0, a, tmp); - r = FbAdd (m, d, 0, tmp); + m = MUL_UN8 (sa, a, tmp); + r = ADD_UN8 (m, d, tmp); *dst++ = r; } } - _mm_empty(); + _mm_empty (); } -void -fbCompositeSrcAdd_8000x8000mmx (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) +static void +mmx_composite_add_8_8 (pixman_implementation_t *imp, + pixman_composite_info_t *info) { - uint8_t *dstLine, *dst; - uint8_t *srcLine, *src; - int dstStride, srcStride; - uint16_t w; - uint8_t s, d; - uint16_t t; + PIXMAN_COMPOSITE_ARGS (info); + uint8_t *dst_line, *dst; + uint8_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; + uint8_t s, d; + uint16_t t; - CHECKPOINT(); + CHECKPOINT (); - fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1); - fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; w = width; while (w && (unsigned long)dst & 7) @@ -2740,7 +2815,7 @@ fbCompositeSrcAdd_8000x8000mmx (pixman_op_t op, while (w >= 8) { - *(__m64*)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst); + *(__m64*)dst = _mm_adds_pu8 (ldq_u((uint64_t *)src), *(__m64*)dst); dst += 8; src += 8; w -= 8; @@ -2760,46 +2835,37 @@ fbCompositeSrcAdd_8000x8000mmx (pixman_op_t op, } } - _mm_empty(); + _mm_empty (); } -void -fbCompositeSrcAdd_8888x8888mmx (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) +static void +mmx_composite_add_8888_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) { + PIXMAN_COMPOSITE_ARGS (info); __m64 dst64; - uint32_t *dstLine, *dst; - uint32_t *srcLine, *src; - int dstStride, srcStride; - uint16_t w; + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; - CHECKPOINT(); + CHECKPOINT (); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; w = width; while (w && (unsigned long)dst & 7) { - *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src), - _mm_cvtsi32_si64(*dst))); + store (dst, _mm_adds_pu8 (_mm_cvtsi32_si64 (*src), + _mm_cvtsi32_si64 (*dst))); dst++; src++; w--; @@ -2807,8 +2873,8 @@ fbCompositeSrcAdd_8888x8888mmx (pixman_op_t op, while (w >= 2) { - dst64 = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst); - *(uint64_t*)dst = UINT64(dst64); + dst64 = _mm_adds_pu8 (ldq_u((uint64_t *)src), *(__m64*)dst); + *(uint64_t*)dst = to_uint64 (dst64); dst += 2; src += 2; w -= 2; @@ -2816,29 +2882,32 @@ fbCompositeSrcAdd_8888x8888mmx (pixman_op_t op, if (w) { - *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src), - _mm_cvtsi32_si64(*dst))); + store (dst, _mm_adds_pu8 (_mm_cvtsi32_si64 (*src), + _mm_cvtsi32_si64 (*dst))); } } - _mm_empty(); + _mm_empty (); } -pixman_bool_t +static pixman_bool_t pixman_blt_mmx (uint32_t *src_bits, - uint32_t *dst_bits, - int src_stride, - int dst_stride, - int src_bpp, - int dst_bpp, - int src_x, int src_y, - int dst_x, int dst_y, - int width, int height) -{ - uint8_t * src_bytes; - uint8_t * dst_bytes; - int byte_width; + uint32_t *dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dest_x, + int dest_y, + int width, + int height) +{ + uint8_t * src_bytes; + uint8_t * dst_bytes; + int byte_width; if (src_bpp != dst_bpp) return FALSE; @@ -2848,19 +2917,23 @@ pixman_blt_mmx (uint32_t *src_bits, src_stride = src_stride * (int) sizeof (uint32_t) / 2; dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); - dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); + dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); byte_width = 2 * width; src_stride *= 2; dst_stride *= 2; - } else if (src_bpp == 32) { + } + else if (src_bpp == 32) + { src_stride = src_stride * (int) sizeof (uint32_t) / 4; dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); - dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); + dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); byte_width = 4 * width; src_stride *= 4; dst_stride *= 4; - } else { + } + else + { return FALSE; } @@ -2873,7 +2946,15 @@ pixman_blt_mmx (uint32_t *src_bits, dst_bytes += dst_stride; w = byte_width; - while (w >= 2 && ((unsigned long)d & 3)) + if (w >= 1 && ((unsigned long)d & 1)) + { + *(uint8_t *)d = *(uint8_t *)s; + w -= 1; + s += 1; + d += 1; + } + + if (w >= 2 && ((unsigned long)d & 3)) { *(uint16_t *)d = *(uint16_t *)s; w -= 2; @@ -2883,7 +2964,7 @@ pixman_blt_mmx (uint32_t *src_bits, while (w >= 4 && ((unsigned long)d & 7)) { - *(uint32_t *)d = *(uint32_t *)s; + *(uint32_t *)d = ldl_u((uint32_t *)s); w -= 4; s += 4; @@ -2892,39 +2973,39 @@ pixman_blt_mmx (uint32_t *src_bits, while (w >= 64) { -#if defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)) +#if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX __asm__ ( - "movq (%1), %%mm0\n" - "movq 8(%1), %%mm1\n" - "movq 16(%1), %%mm2\n" - "movq 24(%1), %%mm3\n" - "movq 32(%1), %%mm4\n" - "movq 40(%1), %%mm5\n" - "movq 48(%1), %%mm6\n" - "movq 56(%1), %%mm7\n" - - "movq %%mm0, (%0)\n" - "movq %%mm1, 8(%0)\n" - "movq %%mm2, 16(%0)\n" - "movq %%mm3, 24(%0)\n" - "movq %%mm4, 32(%0)\n" - "movq %%mm5, 40(%0)\n" - "movq %%mm6, 48(%0)\n" - "movq %%mm7, 56(%0)\n" + "movq (%1), %%mm0\n" + "movq 8(%1), %%mm1\n" + "movq 16(%1), %%mm2\n" + "movq 24(%1), %%mm3\n" + "movq 32(%1), %%mm4\n" + "movq 40(%1), %%mm5\n" + "movq 48(%1), %%mm6\n" + "movq 56(%1), %%mm7\n" + + "movq %%mm0, (%0)\n" + "movq %%mm1, 8(%0)\n" + "movq %%mm2, 16(%0)\n" + "movq %%mm3, 24(%0)\n" + "movq %%mm4, 32(%0)\n" + "movq %%mm5, 40(%0)\n" + "movq %%mm6, 48(%0)\n" + "movq %%mm7, 56(%0)\n" : : "r" (d), "r" (s) : "memory", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"); #else - __m64 v0 = *(__m64 *)(s + 0); - __m64 v1 = *(__m64 *)(s + 8); - __m64 v2 = *(__m64 *)(s + 16); - __m64 v3 = *(__m64 *)(s + 24); - __m64 v4 = *(__m64 *)(s + 32); - __m64 v5 = *(__m64 *)(s + 40); - __m64 v6 = *(__m64 *)(s + 48); - __m64 v7 = *(__m64 *)(s + 56); + __m64 v0 = ldq_u((uint64_t *)(s + 0)); + __m64 v1 = ldq_u((uint64_t *)(s + 8)); + __m64 v2 = ldq_u((uint64_t *)(s + 16)); + __m64 v3 = ldq_u((uint64_t *)(s + 24)); + __m64 v4 = ldq_u((uint64_t *)(s + 32)); + __m64 v5 = ldq_u((uint64_t *)(s + 40)); + __m64 v6 = ldq_u((uint64_t *)(s + 48)); + __m64 v7 = ldq_u((uint64_t *)(s + 56)); *(__m64 *)(d + 0) = v0; *(__m64 *)(d + 8) = v1; *(__m64 *)(d + 16) = v2; @@ -2941,7 +3022,7 @@ pixman_blt_mmx (uint32_t *src_bits, } while (w >= 4) { - *(uint32_t *)d = *(uint32_t *)s; + *(uint32_t *)d = ldl_u((uint32_t *)s); w -= 4; s += 4; @@ -2956,66 +3037,49 @@ pixman_blt_mmx (uint32_t *src_bits, } } - _mm_empty(); + _mm_empty (); return TRUE; } -void -fbCompositeCopyAreammx (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - pixman_blt_mmx (pSrc->bits.bits, - pDst->bits.bits, - pSrc->bits.rowstride, - pDst->bits.rowstride, - PIXMAN_FORMAT_BPP (pSrc->bits.format), - PIXMAN_FORMAT_BPP (pDst->bits.format), - xSrc, ySrc, xDst, yDst, width, height); -} - -void -fbCompositeOver_x888x8x8888mmx (pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - uint32_t *src, *srcLine; - uint32_t *dst, *dstLine; - uint8_t *mask, *maskLine; - int srcStride, maskStride, dstStride; - uint16_t w; - - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); +static void +mmx_composite_copy_area (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + + pixman_blt_mmx (src_image->bits.bits, + dest_image->bits.bits, + src_image->bits.rowstride, + dest_image->bits.rowstride, + PIXMAN_FORMAT_BPP (src_image->bits.format), + PIXMAN_FORMAT_BPP (dest_image->bits.format), + src_x, src_y, dest_x, dest_y, width, height); +} + +static void +mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp, + pixman_composite_info_t *info) +{ + PIXMAN_COMPOSITE_ARGS (info); + uint32_t *src, *src_line; + uint32_t *dst, *dst_line; + uint8_t *mask, *mask_line; + int src_stride, mask_stride, dst_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); while (height--) { - src = srcLine; - srcLine += srcStride; - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; + src = src_line; + src_line += src_stride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; w = width; @@ -3025,17 +3089,20 @@ fbCompositeOver_x888x8x8888mmx (pixman_op_t op, if (m) { - __m64 s = load8888 (*src | 0xff000000); + uint32_t ssrc = *src | 0xff000000; + __m64 s = load8888 (&ssrc); if (m == 0xff) - *dst = store8888 (s); + { + store8888 (dst, s); + } else { __m64 sa = expand_alpha (s); - __m64 vm = expand_alpha_rev (M64(m)); - __m64 vdest = in_over(s, sa, vm, load8888 (*dst)); + __m64 vm = expand_alpha_rev (to_m64 (m)); + __m64 vdest = in_over (s, sa, vm, load8888 (dst)); - *dst = store8888 (vdest); + store8888 (dst, vdest); } } @@ -3045,9 +3112,161 @@ fbCompositeOver_x888x8x8888mmx (pixman_op_t op, } } - _mm_empty(); + _mm_empty (); +} + +static const pixman_fast_path_t mmx_fast_paths[] = +{ + PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, mmx_composite_over_n_8_0565 ), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, mmx_composite_over_n_8_0565 ), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, mmx_composite_over_n_8_8888 ), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, mmx_composite_over_n_8_8888 ), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, mmx_composite_over_n_8_8888 ), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, mmx_composite_over_n_8_8888 ), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mmx_composite_over_n_8888_0565_ca ), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ), + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mmx_composite_over_n_8888_0565_ca ), + PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, mmx_composite_over_pixbuf_8888 ), + PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, mmx_composite_over_pixbuf_8888 ), + PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, mmx_composite_over_pixbuf_0565 ), + PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, mmx_composite_over_pixbuf_8888 ), + PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, mmx_composite_over_pixbuf_8888 ), + PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, mmx_composite_over_pixbuf_0565 ), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, mmx_composite_over_x888_n_8888 ), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, mmx_composite_over_x888_n_8888 ), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, mmx_composite_over_x888_n_8888 ), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, mmx_composite_over_x888_n_8888 ), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, mmx_composite_over_8888_n_8888 ), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, mmx_composite_over_8888_n_8888 ), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, mmx_composite_over_8888_n_8888 ), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, mmx_composite_over_8888_n_8888 ), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, mmx_composite_over_x888_8_8888 ), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, mmx_composite_over_x888_8_8888 ), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, mmx_composite_over_x888_8_8888 ), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, mmx_composite_over_x888_8_8888 ), + PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, mmx_composite_over_n_8888 ), + PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, mmx_composite_over_n_8888 ), + PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, mmx_composite_over_n_0565 ), + PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, mmx_composite_over_n_0565 ), + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ), + + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, mmx_composite_over_8888_8888 ), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, mmx_composite_over_8888_8888 ), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, mmx_composite_over_8888_0565 ), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, mmx_composite_over_8888_8888 ), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, mmx_composite_over_8888_8888 ), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, mmx_composite_over_8888_0565 ), + + PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, mmx_composite_add_8888_8888 ), + PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, mmx_composite_add_8888_8888 ), + PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, mmx_composite_add_8_8 ), + PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, mmx_composite_add_n_8_8 ), + + PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, mmx_composite_src_n_8_8888 ), + PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, mmx_composite_src_n_8_8888 ), + PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, mmx_composite_src_n_8_8888 ), + PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, mmx_composite_src_n_8_8888 ), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, mmx_composite_copy_area ), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, mmx_composite_copy_area ), + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ), + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ), + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ), + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ), + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, mmx_composite_copy_area ), + PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, mmx_composite_copy_area ), + + PIXMAN_STD_FAST_PATH (IN, a8, null, a8, mmx_composite_in_8_8 ), + PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, mmx_composite_in_n_8_8 ), + + { PIXMAN_OP_NONE }, +}; + +static pixman_bool_t +mmx_blt (pixman_implementation_t *imp, + uint32_t * src_bits, + uint32_t * dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dest_x, + int dest_y, + int width, + int height) +{ + if (!pixman_blt_mmx ( + src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, + src_x, src_y, dest_x, dest_y, width, height)) + + { + return _pixman_implementation_blt ( + imp->delegate, + src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, + src_x, src_y, dest_x, dest_y, width, height); + } + + return TRUE; +} + +static pixman_bool_t +mmx_fill (pixman_implementation_t *imp, + uint32_t * bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + if (!pixman_fill_mmx (bits, stride, bpp, x, y, width, height, xor)) + { + return _pixman_implementation_fill ( + imp->delegate, bits, stride, bpp, x, y, width, height, xor); + } + + return TRUE; } +pixman_implementation_t * +_pixman_implementation_create_mmx (pixman_implementation_t *fallback) +{ + pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths); + + imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u; + imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u; + imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u; + imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u; + imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u; + imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u; + imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u; + imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u; + imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u; + imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u; + imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u; + imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca; + imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca; + imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca; + imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca; + imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca; + imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca; + imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca; + + imp->blt = mmx_blt; + imp->fill = mmx_fill; + + return imp; +} -#endif /* USE_MMX */ +#endif /* USE_X86_MMX || USE_ARM_IWMMXT */