From 33d4028e3fffa231f40d66b5843de589ec2642fe Mon Sep 17 00:00:00 2001 From: root Date: Sun, 1 Jul 2007 11:42:49 +0000 Subject: [PATCH] First import of vmx --- configure.ac | 35 ++ pixman/Makefile.am | 12 + pixman/pixman-pict.c | 55 +++ pixman/pixman-vmx.c | 1068 ++++++++++++++++++++++++++++++++++++++++++++++++++ pixman/pixman-vmx.h | 308 +++++++++++++++ 5 files changed, 1478 insertions(+) create mode 100644 pixman/pixman-vmx.c create mode 100644 pixman/pixman-vmx.h diff --git a/configure.ac b/configure.ac index b759c7f..81e2a26 100644 --- a/configure.ac +++ b/configure.ac @@ -76,6 +76,41 @@ AM_CONDITIONAL(USE_MMX, test $have_mmx_intrinsics = yes) dnl ======================================================== +dnl Check for VMX/Altivec +if test -n "`$CC -v 2>&1 | grep version | grep Apple`"; then + VMX_CFLAGS="-faltivec" +else + VMX_CFLAGS="-maltivec -mabi=altivec" +fi + +have_vmx_intrinsics=no +AC_MSG_CHECKING(For VMX/Altivec intrinsics in the compiler) +xserver_save_CFLAGS=$CFLAGS +CFLAGS="$CFLAGS $VMX_CFLAGS" +AC_COMPILE_IFELSE([ +#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4)) +#error "Need GCC >= 3.4 for sane altivec support" +#endif +#include +int main () { + vector unsigned int v = vec_splat_u32 (1); + v = vec_sub (v, v); + return 0; +}], have_vmx_intrinsics=yes) +CFLAGS=$xserver_save_CFLAGS +AC_MSG_RESULT($have_vmx_intrinsics) + +if test $have_vmx_intrinsics = yes ; then + AC_DEFINE(USE_VMX, 1, [use VMX compiler intrinsics]) +else + VMX_CFLAGS= +fi +AC_SUBST(VMX_CFLAGS) + +AM_CONDITIONAL(USE_VMX, test $have_vmx_intrinsics = yes) + +dnl =========================================================================== + PKG_CHECK_MODULES(GTK, [gtk+-2.0], [HAVE_GTK=yes], [HAVE_GTK=no]) AM_CONDITIONAL(HAVE_GTK, [test "x$HAVE_GTK" = xyes]) diff --git a/pixman/Makefile.am b/pixman/Makefile.am index 90c6436..e60c4eb 100644 --- a/pixman/Makefile.am +++ b/pixman/Makefile.am @@ -36,3 +36,15 @@ libpixman_mmx_la_CFLAGS = $(DEP_CFLAGS) $(MMX_CFLAGS) libpixman_mmx_la_LIBADD = $(DEP_LIBS) libpixman_la_LIBADD += libpixman-mmx.la endif + +# vmx code +if USE_VMX +noinst_LTLIBRARIES += libpixman-vmx.la +libpixman_vmx_la_SOURCES = \ + pixman-vmx.c \ + pixman-vmx.h +libpixman_vmx_la_CFLAGS = $(DEP_CFLAGS) $(VMX_CFLAGS) +libpixman_vmx_la_LIBADD = $(DEP_LIBS) +libpixman_la_LIBADD += libpixman-vmx.la +endif + diff --git a/pixman/pixman-pict.c b/pixman/pixman-pict.c index cad11dd..a857de5 100644 --- a/pixman/pixman-pict.c +++ b/pixman/pixman-pict.c @@ -29,6 +29,7 @@ #include "pixman.h" #include "pixman-private.h" #include "pixman-mmx.h" +#include "pixman-vmx.h" #define FbFullMask(n) ((n) == 32 ? (uint32_t)-1 : ((((uint32_t) 1) << n) - 1)) @@ -1416,6 +1417,13 @@ pixman_image_composite (pixman_op_t op, mmx_setup = TRUE; } #endif +#ifdef USE_VMX + static pixman_bool_t vmx_setup = FALSE; + if (!vmx_setup) { + fbComposeSetupVMX(); + vmx_setup = TRUE; + } +#endif if (srcRepeat && srcTransform && pSrc->bits.width == 1 && @@ -2062,6 +2070,53 @@ pixman_image_composite (pixman_op_t op, } +#ifdef USE_VMX +/* The CPU detection code needs to be in a file not compiled with + * "-maltivec -mabi=altivec", as gcc would try to save vector register + * across function calls causing SIGILL on cpus without Altivec/vmx. 
+ */ +#ifdef __APPLE__ +#include + +pixman_bool_t pixman_have_vmx (void) { + int hasVMX = 0; + size_t length = sizeof ( hasVMX ); + int error = sysctlbyname ("hw.optional.altivec", &hasVMX, &length, NULL, 0); + if ( 0 != error ) return 0; + return hasVMX; +} + +#else +#include +#include + +static sigjmp_buf jmp; +static volatile sig_atomic_t in_test = 0; + +static void vmx_test (int sig) { + if (! in_test) { + signal (sig, SIG_DFL); + raise (sig); + } + in_test = 0; + siglongjmp (jmp, 1); +} + +pixman_bool_t pixman_have_vmx (void) { + signal (SIGILL, vmx_test); + if (sigsetjmp (jmp, 1)) { + signal (SIGILL, SIG_DFL); + } else { + in_test = 1; + asm volatile ( "vor 0, 0, 0" ); + signal (SIGILL, SIG_DFL); + return 1; + } + return 0; +} +#endif /* __APPLE__ */ +#endif /* USE_VMX */ + #ifdef USE_MMX /* The CPU detection code needs to be in a file not compiled with * "-mmmx -msse", as gcc would generate CMOV instructions otherwise diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c new file mode 100644 index 0000000..0008dc5 --- /dev/null +++ b/pixman/pixman-vmx.c @@ -0,0 +1,1068 @@ +/* + * Copyright © 2007 Luca Barbato + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Luca Barbato not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. Luca Barbato makes no representations about the + * suitability of this software for any purpose. It is provided "as is" + * without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. 
+ * + * Author: Luca Barbato (lu_zero@gentoo.org) + * + * Based on fbmmx.c by Owen Taylor, Søren Sandmann and Nicholas Miell + */ + +#include +#include "pixman-vmx.h" +#include + +static inline vector unsigned int +splat_alpha (vector unsigned int pix) { + return vec_perm (pix, pix, + (vector unsigned char)AVV(0x00,0x00,0x00,0x00, 0x04,0x04,0x04,0x04, + 0x08,0x08,0x08,0x08, 0x0C,0x0C,0x0C,0x0C)); +} + +static inline vector unsigned int +pix_multiply (vector unsigned int p, vector unsigned int a) +{ + vector unsigned short hi, lo, mod; + /* unpack to short */ + hi = (vector unsigned short) + vec_mergeh ((vector unsigned char)AVV(0), + (vector unsigned char)p); + mod = (vector unsigned short) + vec_mergeh ((vector unsigned char)AVV(0), + (vector unsigned char)a); + + hi = vec_mladd (hi, mod, (vector unsigned short) + AVV(0x0080,0x0080,0x0080,0x0080, + 0x0080,0x0080,0x0080,0x0080)); + + hi = vec_adds (hi, vec_sr (hi, vec_splat_u16 (8))); + + hi = vec_sr (hi, vec_splat_u16 (8)); + + /* unpack to short */ + lo = (vector unsigned short) + vec_mergel ((vector unsigned char)AVV(0), + (vector unsigned char)p); + mod = (vector unsigned short) + vec_mergel ((vector unsigned char)AVV(0), + (vector unsigned char)a); + + lo = vec_mladd (lo, mod, (vector unsigned short) + AVV(0x0080,0x0080,0x0080,0x0080, + 0x0080,0x0080,0x0080,0x0080)); + + lo = vec_adds (lo, vec_sr (lo, vec_splat_u16 (8))); + + lo = vec_sr (lo, vec_splat_u16 (8)); + + return (vector unsigned int)vec_packsu (hi, lo); +} + +static inline vector unsigned int +pix_add (vector unsigned int a, vector unsigned int b) +{ + return (vector unsigned int)vec_adds ((vector unsigned char)a, + (vector unsigned char)b); +} + +static inline vector unsigned int +pix_add_mul (vector unsigned int x, vector unsigned int a, + vector unsigned int y, vector unsigned int b) +{ + vector unsigned short hi, lo, mod, hiy, loy, mody; + + hi = (vector unsigned short) + vec_mergeh ((vector unsigned char)AVV(0), + (vector unsigned char)x); + mod = (vector unsigned short) + vec_mergeh ((vector unsigned char)AVV(0), + (vector unsigned char)a); + hiy = (vector unsigned short) + vec_mergeh ((vector unsigned char)AVV(0), + (vector unsigned char)y); + mody = (vector unsigned short) + vec_mergeh ((vector unsigned char)AVV(0), + (vector unsigned char)b); + + hi = vec_mladd (hi, mod, (vector unsigned short) + AVV(0x0080,0x0080,0x0080,0x0080, + 0x0080,0x0080,0x0080,0x0080)); + + hi = vec_mladd (hiy, mody, hi); + + hi = vec_adds (hi, vec_sr (hi, vec_splat_u16 (8))); + + hi = vec_sr (hi, vec_splat_u16 (8)); + + lo = (vector unsigned short) + vec_mergel ((vector unsigned char)AVV(0), + (vector unsigned char)x); + mod = (vector unsigned short) + vec_mergel ((vector unsigned char)AVV(0), + (vector unsigned char)a); + + loy = (vector unsigned short) + vec_mergel ((vector unsigned char)AVV(0), + (vector unsigned char)y); + mody = (vector unsigned short) + vec_mergel ((vector unsigned char)AVV(0), + (vector unsigned char)b); + + lo = vec_mladd (lo, mod, (vector unsigned short) + AVV(0x0080,0x0080,0x0080,0x0080, + 0x0080,0x0080,0x0080,0x0080)); + + lo = vec_mladd (loy, mody, lo); + + lo = vec_adds (lo, vec_sr (lo, vec_splat_u16 (8))); + + lo = vec_sr (lo, vec_splat_u16 (8)); + + return (vector unsigned int)vec_packsu (hi, lo); +} + +static inline vector unsigned int +negate (vector unsigned int src) +{ + return vec_nor (src, src); +} +/* dest*~srca + src */ +static inline vector unsigned int +over (vector unsigned int src, vector unsigned int srca, + vector unsigned int dest) +{ + vector 
unsigned char tmp = (vector unsigned char) + pix_multiply (dest, negate (srca)); + tmp = vec_adds ((vector unsigned char)src, tmp); + return (vector unsigned int)tmp; +} + +/* in == pix_multiply */ +#define in_over(src, srca, mask, dest) over (pix_multiply (src, mask),\ + pix_multiply (srca, mask), dest) + + +#define COMPUTE_SHIFT_MASK(source) \ + source ## _mask = vec_lvsl (0, source); + +#define COMPUTE_SHIFT_MASKS(dest, source) \ + dest ## _mask = vec_lvsl (0, dest); \ + source ## _mask = vec_lvsl (0, source); \ + store_mask = vec_lvsr (0, dest); + +#define COMPUTE_SHIFT_MASKC(dest, source, mask) \ + mask ## _mask = vec_lvsl (0, mask); \ + dest ## _mask = vec_lvsl (0, dest); \ + source ## _mask = vec_lvsl (0, source); \ + store_mask = vec_lvsr (0, dest); + +/* notice you have to declare temp vars... + * Note: tmp3 and tmp4 must remain untouched! + */ +#define LOAD_VECTOR (source) \ + tmp1 = (typeof(v ## source))vec_ld(0, source); \ + tmp2 = (typeof(v ## source))vec_ld(15, source); \ + v ## source = (typeof(v ## source)) \ + vec_perm(tmp1, tmp2, source ## _mask); + +#define LOAD_VECTORS(dest, source) \ + tmp1 = (typeof(tmp1))vec_ld(0, source); \ + tmp2 = (typeof(tmp2))vec_ld(15, source); \ + tmp3 = (typeof(tmp3))vec_ld(0, dest); \ + v ## source = (typeof(v ## source)) \ + vec_perm(tmp1, tmp2, source ## _mask); \ + tmp4 = (typeof(tmp4))vec_ld(15, dest); \ + v ## dest = (typeof(v ## dest)) \ + vec_perm(tmp3, tmp4, dest ## _mask); + +#define LOAD_VECTORSC(dest, source, mask) \ + tmp1 = (typeof(tmp1))vec_ld(0, source); \ + tmp2 = (typeof(tmp2))vec_ld(15, source); \ + tmp3 = (typeof(tmp3))vec_ld(0, dest); \ + v ## source = (typeof(v ## source)) \ + vec_perm(tmp1, tmp2, source ## _mask); \ + tmp4 = (typeof(tmp4))vec_ld(15, dest); \ + tmp1 = (typeof(tmp1))vec_ld(0, mask); \ + v ## dest = (typeof(v ## dest)) \ + vec_perm(tmp3, tmp4, dest ## _mask); \ + tmp2 = (typeof(tmp2))vec_ld(15, mask); \ + v ## mask = (typeof(v ## mask)) \ + vec_perm(tmp1, tmp2, mask ## _mask); +#define STORE_VECTOR(dest) \ + edges = vec_perm (tmp4, tmp3, dest ## _mask); \ + tmp3 = vec_perm ((vector unsigned char)v ## dest, edges, store_mask); \ + tmp1 = vec_perm (edges, (vector unsigned char)v ## dest, store_mask); \ + vec_st ((vector unsigned int) tmp3, 15, dest ); \ + vec_st ((vector unsigned int) tmp1, 0, dest ); + +static FASTCALL void +vmxCombineMaskU (uint32_t *src, const uint32_t *msk, int width) +{ + int i; + vector unsigned int vsrc, vmsk; + vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, + src_mask, msk_mask, store_mask; + + COMPUTE_SHIFT_MASKS(src, msk) + + /* printf ("%s\n",__PRETTY_FUNCTION__); */ + for (i = width/4; i > 0; i--) { + + LOAD_VECTORS(src, msk) + + vsrc = pix_multiply (vsrc, splat_alpha (vmsk)); + + STORE_VECTOR(src) + + msk+=4; + src+=4; + } + + for (i = width%4; --i >= 0;) { + uint32_t a = msk[i] >> 24; + uint32_t s = src[i]; + FbByteMul (s, a); + src[i] = s; + } +} + +static FASTCALL void +vmxCombineOverU (uint32_t *dest, const uint32_t *src, int width) +{ + int i; + vector unsigned int vdest, vsrc; + vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, + dest_mask, src_mask, store_mask; + + COMPUTE_SHIFT_MASKS(dest, src) + + /* printf ("%s\n",__PRETTY_FUNCTION__); */ + for (i = width/4; i > 0; i--) { + + LOAD_VECTORS(dest, src) + + vdest = over (vsrc, splat_alpha (vsrc), vdest); + + STORE_VECTOR(dest) + + src+=4; + dest+=4; + } + + for (i = width%4; --i >=0;) { + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t ia = Alpha (~s); + + FbByteMulAdd (d, ia, s); + dest[i] = d; + } +} + + 
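[Editor's note on the LOAD_VECTORS/STORE_VECTOR machinery used by every combiner in this file: AltiVec's vec_ld and vec_st simply ignore the low four address bits, so unaligned src/mask/dest pointers are handled by loading the two aligned quadwords that straddle the data and stitching them together with vec_perm, using the permute vector returned by vec_lvsl (vec_lvsr provides the inverse permute for the store path). The stand-alone sketch below shows the load half for a single pointer; the helper name is ours, not something the patch defines, and it needs the same -maltivec/-faltivec flags as pixman-vmx.c.]

    #include <stdint.h>
    #include <altivec.h>

    /* Illustrative helper (not in the patch): fetch 16 bytes starting at a
     * possibly unaligned pointer.  vec_ld truncates the address to a 16-byte
     * boundary, so we load the two aligned quadwords around the data and let
     * vec_perm select the wanted bytes, the same idea LOAD_VECTORS applies
     * with source ## _mask = vec_lvsl (0, source). */
    static vector unsigned int
    load_unaligned (const uint32_t *p)
    {
        vector unsigned char perm = vec_lvsl (0, p);   /* byte-select mask from the misalignment */
        vector unsigned int  lo   = vec_ld (0, p);     /* aligned quadword containing p[0] */
        vector unsigned int  hi   = vec_ld (15, p);    /* next aligned quadword */

        return vec_perm (lo, hi, perm);                /* the 16 bytes at p, reassembled */
    }

[STORE_VECTOR runs the same idea backwards: vec_lvsr gives the store permute, the computed pixels are merged with the edge bytes that must survive, and two aligned vec_st calls write the result back. That is why the macro comment insists tmp3 and tmp4 stay untouched: they still hold the original destination quadwords whose edges get preserved.]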
+static FASTCALL void +vmxCombineOverReverseU (uint32_t *dest, const uint32_t *src, int width) +{ + int i; + vector unsigned int vdest, vsrc; + vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, + dest_mask, src_mask, store_mask; + + COMPUTE_SHIFT_MASKS(dest, src) + + /* printf ("%s\n",__PRETTY_FUNCTION__); */ + for (i = width/4; i > 0; i--) { + + LOAD_VECTORS(dest, src) + + vdest = over (vdest, splat_alpha (vdest) , vsrc); + + STORE_VECTOR(dest) + + src+=4; + dest+=4; + } + + for (i = width%4; --i >=0;) { + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t ia = Alpha (~dest[i]); + + FbByteMulAdd (s, ia, d); + dest[i] = s; + } +} + +static FASTCALL void +vmxCombineInU (uint32_t *dest, const uint32_t *src, int width) +{ + int i; + vector unsigned int vdest, vsrc; + vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, + dest_mask, src_mask, store_mask; + + COMPUTE_SHIFT_MASKS(dest, src) + + /* printf ("%s\n",__PRETTY_FUNCTION__); */ + for (i = width/4; i > 0; i--) { + + LOAD_VECTORS(dest, src) + + vdest = pix_multiply (vsrc, splat_alpha (vdest)); + + STORE_VECTOR(dest) + + src+=4; + dest+=4; + } + + for (i = width%4; --i >=0;) { + + uint32_t s = src[i]; + uint32_t a = Alpha (dest[i]); + FbByteMul (s, a); + dest[i] = s; + } +} + +static FASTCALL void +vmxCombineInReverseU (uint32_t *dest, const uint32_t *src, int width) +{ + int i; + vector unsigned int vdest, vsrc; + vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, + dest_mask, src_mask, store_mask; + + COMPUTE_SHIFT_MASKS(dest, src) + + /* printf ("%s\n",__PRETTY_FUNCTION__); */ + for (i = width/4; i > 0; i--) { + + LOAD_VECTORS(dest, src) + + vdest = pix_multiply (vdest, splat_alpha (vsrc)); + + STORE_VECTOR(dest) + + src+=4; + dest+=4; + } + + for (i = width%4; --i >=0;) { + uint32_t d = dest[i]; + uint32_t a = Alpha (src[i]); + FbByteMul (d, a); + dest[i] = d; + } +} + +static FASTCALL void +vmxCombineOutU (uint32_t *dest, const uint32_t *src, int width) +{ + int i; + vector unsigned int vdest, vsrc; + vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, + dest_mask, src_mask, store_mask; + + COMPUTE_SHIFT_MASKS(dest, src) + + /* printf ("%s\n",__PRETTY_FUNCTION__); */ + for (i = width/4; i > 0; i--) { + + LOAD_VECTORS(dest, src) + + vdest = pix_multiply (vsrc, splat_alpha (negate (vdest))); + + STORE_VECTOR(dest) + + src+=4; + dest+=4; + } + + for (i = width%4; --i >=0;) { + uint32_t s = src[i]; + uint32_t a = Alpha (~dest[i]); + FbByteMul (s, a); + dest[i] = s; + } +} + +static FASTCALL void +vmxCombineOutReverseU (uint32_t *dest, const uint32_t *src, int width) +{ + int i; + vector unsigned int vdest, vsrc; + vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, + dest_mask, src_mask, store_mask; + + COMPUTE_SHIFT_MASKS(dest, src) + + /* printf ("%s\n",__PRETTY_FUNCTION__); */ + for (i = width/4; i > 0; i--) { + + LOAD_VECTORS(dest, src) + + vdest = pix_multiply (vdest, splat_alpha (negate (vsrc))); + + STORE_VECTOR(dest) + + src+=4; + dest+=4; + } + + for (i = width%4; --i >=0;) { + uint32_t d = dest[i]; + uint32_t a = Alpha (~src[i]); + FbByteMul (d, a); + dest[i] = d; + } +} + +static FASTCALL void +vmxCombineAtopU (uint32_t *dest, const uint32_t *src, int width) +{ + int i; + vector unsigned int vdest, vsrc; + vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, + dest_mask, src_mask, store_mask; + + COMPUTE_SHIFT_MASKS(dest, src) + + /* printf ("%s\n",__PRETTY_FUNCTION__); */ + for (i = width/4; i > 0; i--) { + + LOAD_VECTORS(dest, src) + + vdest = pix_add_mul (vsrc, splat_alpha (vdest), + vdest, splat_alpha (negate 
(vsrc))); + + STORE_VECTOR(dest) + + src+=4; + dest+=4; + } + + for (i = width%4; --i >=0;) { + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t dest_a = Alpha (d); + uint32_t src_ia = Alpha (~s); + + FbByteAddMul (s, dest_a, d, src_ia); + dest[i] = s; + } +} + +static FASTCALL void +vmxCombineAtopReverseU (uint32_t *dest, const uint32_t *src, int width) +{ + int i; + vector unsigned int vdest, vsrc; + vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, + dest_mask, src_mask, store_mask; + + COMPUTE_SHIFT_MASKS(dest, src) + + /* printf ("%s\n",__PRETTY_FUNCTION__); */ + for (i = width/4; i > 0; i--) { + + LOAD_VECTORS(dest, src) + + vdest = pix_add_mul (vdest, splat_alpha (vsrc), + vsrc, splat_alpha (negate (vdest))); + + STORE_VECTOR(dest) + + src+=4; + dest+=4; + } + + for (i = width%4; --i >=0;) { + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t src_a = Alpha (s); + uint32_t dest_ia = Alpha (~d); + + FbByteAddMul (s, dest_ia, d, src_a); + dest[i] = s; + } +} + +static FASTCALL void +vmxCombineXorU (uint32_t *dest, const uint32_t *src, int width) +{ + int i; + vector unsigned int vdest, vsrc; + vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, + dest_mask, src_mask, store_mask; + + COMPUTE_SHIFT_MASKS(dest, src) + + /* printf ("%s\n",__PRETTY_FUNCTION__); */ + for (i = width/4; i > 0; i--) { + + LOAD_VECTORS (dest, src) + + vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)), + vdest, splat_alpha (negate (vsrc))); + + STORE_VECTOR(dest) + + src+=4; + dest+=4; + } + + for (i = width%4; --i >=0;) { + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t src_ia = Alpha (~s); + uint32_t dest_ia = Alpha (~d); + + FbByteAddMul (s, dest_ia, d, src_ia); + dest[i] = s; + } +} + +static FASTCALL void +vmxCombineAddU (uint32_t *dest, const uint32_t *src, int width) +{ + int i; + vector unsigned int vdest, vsrc; + vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, + dest_mask, src_mask, store_mask; + + COMPUTE_SHIFT_MASKS(dest, src) + /* printf ("%s\n",__PRETTY_FUNCTION__); */ + for (i = width/4; i > 0; i--) { + + LOAD_VECTORS(dest, src) + + vdest = pix_add (vsrc, vdest); + + STORE_VECTOR(dest) + + src+=4; + dest+=4; + } + + for (i = width%4; --i >=0;) { + uint32_t s = src[i]; + uint32_t d = dest[i]; + FbByteAdd (d, s); + dest[i] = d; + } +} + +static FASTCALL void +vmxCombineSrcC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width) +{ + int i; + vector unsigned int vdest, vsrc, vmask; + vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, + dest_mask, mask_mask, src_mask, store_mask; + + COMPUTE_SHIFT_MASKC(dest, src, mask); + /* printf ("%s\n",__PRETTY_FUNCTION__); */ + for (i = width/4; i > 0; i--) { + + LOAD_VECTORSC(dest, src, mask) + + vdest = pix_multiply (vsrc, vmask); + + STORE_VECTOR(dest) + + mask+=4; + src+=4; + dest+=4; + } + + for (i = width%4; --i >=0;) { + uint32_t a = mask[i]; + uint32_t s = src[i]; + FbByteMulC (s, a); + dest[i] = s; + } +} + +static FASTCALL void +vmxCombineOverC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width) +{ + int i; + vector unsigned int vdest, vsrc, vmask; + vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, + dest_mask, mask_mask, src_mask, store_mask; + + COMPUTE_SHIFT_MASKC(dest, src, mask); + /* printf ("%s\n",__PRETTY_FUNCTION__); */ + for (i = width/4; i > 0; i--) { + + LOAD_VECTORSC(dest, src, mask) + + vdest = in_over (vsrc, splat_alpha (vsrc), vmask, vdest); + + STORE_VECTOR(dest) + + mask+=4; + src+=4; + dest+=4; + } + + for (i = width%4; --i >=0;) { + uint32_t a = mask[i]; + uint32_t s = src[i]; + 
uint32_t d = dest[i]; + FbByteMulC (s, a); + FbByteMulAddC (d, ~a, s); + dest[i] = d; + } +} + +static FASTCALL void +vmxCombineOverReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width) +{ + int i; + vector unsigned int vdest, vsrc, vmask; + vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, + dest_mask, mask_mask, src_mask, store_mask; + + COMPUTE_SHIFT_MASKC(dest, src, mask); + /* printf("%s\n",__PRETTY_FUNCTION__); */ + for (i = width/4; i > 0; i--) { + + LOAD_VECTORSC (dest, src, mask) + + vdest = over (vdest, splat_alpha (vdest), pix_multiply (vsrc, vmask)); + + STORE_VECTOR(dest) + + mask+=4; + src+=4; + dest+=4; + } + + for (i = width%4; --i >=0;) { + uint32_t a = mask[i]; + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t da = Alpha (d); + FbByteMulC (s, a); + FbByteMulAddC (s, ~da, d); + dest[i] = s; + } +} + +static FASTCALL void +vmxCombineInC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width) +{ + int i; + vector unsigned int vdest, vsrc, vmask; + vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, + dest_mask, mask_mask, src_mask, store_mask; + + COMPUTE_SHIFT_MASKC(dest, src, mask) + + /* printf ("%s\n",__PRETTY_FUNCTION__); */ + for (i = width/4; i > 0; i--) { + + LOAD_VECTORSC(dest, src, mask) + + vdest = pix_multiply (pix_multiply (vsrc, vmask), splat_alpha (vdest)); + + STORE_VECTOR(dest) + + src+=4; + dest+=4; + mask+=4; + } + + for (i = width%4; --i >=0;) { + uint32_t a = mask[i]; + uint32_t s = src[i]; + uint32_t da = Alpha (dest[i]); + FbByteMul (s, a); + FbByteMul (s, da); + dest[i] = s; + } +} + +static FASTCALL void +vmxCombineInReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width) +{ + int i; + vector unsigned int vdest, vsrc, vmask; + vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, + dest_mask, mask_mask, src_mask, store_mask; + + COMPUTE_SHIFT_MASKC(dest, src, mask) + + /* printf ("%s\n",__PRETTY_FUNCTION__); */ + for (i = width/4; i > 0; i--) { + + LOAD_VECTORSC(dest, src, mask) + + vdest = pix_multiply (vdest, pix_multiply (vmask, splat_alpha (vsrc))); + + STORE_VECTOR(dest) + + src+=4; + dest+=4; + mask+=4; + } + + for (i = width%4; --i >=0;) { + uint32_t a = mask[i]; + uint32_t d = dest[i]; + uint32_t sa = Alpha (src[i]); + FbByteMul (a, sa); + FbByteMulC (d, a); + dest[i] = d; + } +} + +static FASTCALL void +vmxCombineOutC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width) +{ + int i; + vector unsigned int vdest, vsrc, vmask; + vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, + dest_mask, mask_mask, src_mask, store_mask; + + COMPUTE_SHIFT_MASKC(dest, src, mask) + + /* printf ("%s\n",__PRETTY_FUNCTION__); */ + for (i = width/4; i > 0; i--) { + + LOAD_VECTORSC(dest, src, mask) + + vdest = pix_multiply (pix_multiply (vsrc, vmask), splat_alpha (vdest)); + + STORE_VECTOR(dest) + + src+=4; + dest+=4; + mask+=4; + } + + for (i = width%4; --i >=0;) { + uint32_t a = mask[i]; + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t da = Alpha (~d); + FbByteMulC (s, a); + FbByteMulC (s, da); + dest[i] = s; + } +} + +static FASTCALL void +vmxCombineOutReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width) +{ + int i; + vector unsigned int vdest, vsrc, vmask; + vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, + dest_mask, mask_mask, src_mask, store_mask; + + COMPUTE_SHIFT_MASKC(dest, src, mask) + + /* printf ("%s\n",__PRETTY_FUNCTION__); */ + for (i = width/4; i > 0; i--) { + + LOAD_VECTORSC(dest, src, mask) + + vdest = pix_multiply (vdest, + negate (pix_multiply (vmask, splat_alpha (vsrc)))); + 
+ STORE_VECTOR(dest) + + src+=4; + dest+=4; + mask+=4; + } + + for (i = width%4; --i >=0;) { + uint32_t a = mask[i]; + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t sa = Alpha (s); + FbByteMulC (a, sa); + FbByteMulC (d, ~a); + dest[i] = d; + } +} + +static FASTCALL void +vmxCombineAtopC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width) +{ + int i; + vector unsigned int vdest, vsrc, vmask; + vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, + dest_mask, mask_mask, src_mask, store_mask; + + COMPUTE_SHIFT_MASKC(dest, src, mask) + + /* printf ("%s\n",__PRETTY_FUNCTION__); */ + for (i = width/4; i > 0; i--) { + + LOAD_VECTORSC(dest, src, mask) + + vdest = pix_add_mul (pix_multiply (vsrc, vmask), splat_alpha (vdest), + vdest, + negate (pix_multiply (vmask, + splat_alpha (vmask)))); + + STORE_VECTOR(dest) + + src+=4; + dest+=4; + mask+=4; + } + + for (i = width%4; --i >=0;) { + uint32_t a = mask[i]; + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t sa = Alpha (s); + uint32_t da = Alpha (d); + + FbByteMulC (s, a); + FbByteMul (a, sa); + FbByteAddMulC (d, ~a, s, da); + dest[i] = d; + } +} + +static FASTCALL void +vmxCombineAtopReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width) +{ + int i; + vector unsigned int vdest, vsrc, vmask; + vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, + dest_mask, mask_mask, src_mask, store_mask; + + COMPUTE_SHIFT_MASKC(dest, src, mask) + + /* printf ("%s\n",__PRETTY_FUNCTION__); */ + for (i = width/4; i > 0; i--) { + + LOAD_VECTORSC(dest, src, mask) + + vdest = pix_add_mul (vdest, + pix_multiply (vmask, splat_alpha (vsrc)), + pix_multiply (vsrc, vmask), + negate (splat_alpha (vdest))); + + STORE_VECTOR(dest) + + src+=4; + dest+=4; + mask+=4; + } + + for (i = width%4; --i >=0;) { + uint32_t a = mask[i]; + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t sa = Alpha (s); + uint32_t da = Alpha (d); + + FbByteMulC (s, a); + FbByteMul (a, sa); + FbByteAddMulC (d, a, s, ~da); + dest[i] = d; + } +} + +static FASTCALL void +vmxCombineXorC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width) +{ + int i; + vector unsigned int vdest, vsrc, vmask; + vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, + dest_mask, mask_mask, src_mask, store_mask; + + COMPUTE_SHIFT_MASKC(dest, src, mask) + + /* printf ("%s\n",__PRETTY_FUNCTION__); */ + for (i = width/4; i > 0; i--) { + + LOAD_VECTORSC(dest, src, mask) + + vdest = pix_add_mul (vdest, + negate (pix_multiply (vmask, splat_alpha (vsrc))), + pix_multiply (vsrc, vmask), + negate (splat_alpha (vdest))); + + STORE_VECTOR(dest) + + src+=4; + dest+=4; + mask+=4; + } + + for (i = width%4; --i >=0;) { + uint32_t a = mask[i]; + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t sa = Alpha (s); + uint32_t da = Alpha (d); + + FbByteMulC (s, a); + FbByteMul (a, sa); + FbByteAddMulC (d, ~a, s, ~da); + dest[i] = d; + } +} + +static FASTCALL void +vmxCombineAddC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width) +{ + int i; + vector unsigned int vdest, vsrc, vmask; + vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, + dest_mask, mask_mask, src_mask, store_mask; + + COMPUTE_SHIFT_MASKC(dest, src, mask) + + /* printf ("%s\n",__PRETTY_FUNCTION__); */ + for (i = width/4; i > 0; i--) { + + LOAD_VECTORSC(dest, src, mask) + + vdest = pix_add (pix_multiply (vsrc, vmask), vdest); + + STORE_VECTOR(dest) + + src+=4; + dest+=4; + mask+=4; + } + + for (i = width%4; --i >=0;) { + uint32_t a = mask[i]; + uint32_t s = src[i]; + uint32_t d = dest[i]; + + FbByteMulC (s, a); + FbByteAdd (s, d); 
+ dest[i] = s; + } +} + + +#if 0 +void +fbCompositeSolid_nx8888vmx (pixman_operator_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height) +{ + uint32_t src; + uint32_t *dstLine, *dst; + int dstStride; + + fbComposeGetSolid (pSrc, pDst, src); + + if (src >> 24 == 0) + return; + + fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); + + while (height--) + { + dst = dstLine; + dstLine += dstStride; + /* XXX vmxCombineOverU (dst, src, width); */ + } +} + +void +fbCompositeSolid_nx0565vmx (pixman_operator_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + int16_t xSrc, + int16_t ySrc, + int16_t xMask, + int16_t yMask, + int16_t xDst, + int16_t yDst, + uint16_t width, + uint16_t height) +{ + uint32_t src; + uint16_t *dstLine, *dst; + uint16_t w; + int dstStride; + + fbComposeGetSolid (pSrc, pDst, src); + + if (src >> 24 == 0) + return; + + fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); + + while (height--) + { + dst = dstLine; + dstLine += dstStride; + vmxCombineOverU565(dst, src, width); + } +} + +#endif + +void fbComposeSetupVMX (void) +{ + /* check if we have VMX support and initialize accordingly */ + if (pixman_have_vmx ()) { + pixman_composeFunctions.combineU[PIXMAN_OP_OVER] = vmxCombineOverU; + pixman_composeFunctions.combineU[PIXMAN_OP_OVER_REVERSE] = vmxCombineOverReverseU; + pixman_composeFunctions.combineU[PIXMAN_OP_IN] = vmxCombineInU; + pixman_composeFunctions.combineU[PIXMAN_OP_IN_REVERSE] = vmxCombineInReverseU; + pixman_composeFunctions.combineU[PIXMAN_OP_OUT] = vmxCombineOutU; + pixman_composeFunctions.combineU[PIXMAN_OP_OUT_REVERSE] = vmxCombineOutReverseU; + pixman_composeFunctions.combineU[PIXMAN_OP_ATOP] = vmxCombineAtopU; + pixman_composeFunctions.combineU[PIXMAN_OP_ATOP_REVERSE] = vmxCombineAtopReverseU; + pixman_composeFunctions.combineU[PIXMAN_OP_XOR] = vmxCombineXorU; + pixman_composeFunctions.combineU[PIXMAN_OP_ADD] = vmxCombineAddU; + + pixman_composeFunctions.combineC[PIXMAN_OP_SRC] = vmxCombineSrcC; + pixman_composeFunctions.combineC[PIXMAN_OP_OVER] = vmxCombineOverC; + pixman_composeFunctions.combineC[PIXMAN_OP_OVER_REVERSE] = vmxCombineOverReverseC; + pixman_composeFunctions.combineC[PIXMAN_OP_IN] = vmxCombineInC; + pixman_composeFunctions.combineC[PIXMAN_OP_IN_REVERSE] = vmxCombineInReverseC; + pixman_composeFunctions.combineC[PIXMAN_OP_OUT] = vmxCombineOutC; + pixman_composeFunctions.combineC[PIXMAN_OP_OUT_REVERSE] = vmxCombineOutReverseC; + pixman_composeFunctions.combineC[PIXMAN_OP_ATOP] = vmxCombineAtopC; + pixman_composeFunctions.combineC[PIXMAN_OP_ATOP_REVERSE] = vmxCombineAtopReverseC; + pixman_composeFunctions.combineC[PIXMAN_OP_XOR] = vmxCombineXorC; + pixman_composeFunctions.combineC[PIXMAN_OP_ADD] = vmxCombineAddC; + + pixman_composeFunctions.combineMaskU = vmxCombineMaskU; + } +} diff --git a/pixman/pixman-vmx.h b/pixman/pixman-vmx.h new file mode 100644 index 0000000..70cb53a --- /dev/null +++ b/pixman/pixman-vmx.h @@ -0,0 +1,308 @@ +/* + * Copyright © 2007 Luca Barbato + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Luca 
Barbato not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. Luca Barbato makes no representations about the + * suitability of this software for any purpose. It is provided "as is" + * without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + * Author: Luca Barbato (lu_zero@gentoo.org) + * + * Based on work by Owen Taylor, Søren Sandmann and Lars Knoll + */ + +#include "pixman-private.h" + +#ifdef USE_VMX + +pixman_bool_t pixman_have_vmx(void); + +#else +#define pixman_have_vmx() FALSE +#endif + +#ifdef USE_VMX + +#define AVV(x...) {x} + +void fbComposeSetupVMX (void); + +#if 0 +void fbCompositeIn_nx8x8vmx (pixman_operator_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height); + +void fbCompositeSolidMask_nx8888x0565Cvmx (pixman_operator_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height); + +void fbCompositeSrcAdd_8888x8888vmx (pixman_operator_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height); + +void fbCompositeSolidMask_nx8888x8888Cvmx (pixman_operator_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height); + +void fbCompositeSolidMask_nx8x8888vmx (pixman_operator_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height); + +void fbCompositeSolidMaskSrc_nx8x8888vmx (pixman_operator_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height); + +void fbCompositeSrcAdd_8888x8x8vmx (pixman_operator_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height); + +void fbCompositeIn_8x8vmx (pixman_operator_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height); + +void fbCompositeSrcAdd_8000x8000vmx (pixman_operator_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height); + +void fbCompositeSrc_8888RevNPx8888vmx (pixman_operator_t op, + pixman_image_t * 
pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height); + +void fbCompositeSrc_8888x0565vmx (pixman_operator_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height); + +void fbCompositeSrc_8888RevNPx0565vmx (pixman_operator_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height); + +void fbCompositeSolid_nx8888vmx (pixman_operator_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height); + +void fbCompositeSolid_nx0565vmx (pixman_operator_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height); + +void fbCompositeSolidMask_nx8x0565vmx (pixman_operator_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height); + +void fbCompositeSrc_x888x8x8888vmx (pixman_operator_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height); + +void fbCompositeSrc_8888x8x8888vmx (pixman_operator_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height); + +void fbCompositeSrc_8888x8888vmx (pixman_operator_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height); + +pixman_bool_t fbCopyAreavmx (FbPixels *pSrc, + FbPixels *pDst, + int src_x, + int src_y, + int dst_x, + int dst_y, + int width, + int height); + +void fbCompositeCopyAreavmx (pixman_operator_t op, + pixman_image_t * pSrc, + pixman_image_t * pMask, + pixman_image_t * pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height); + +pixman_bool_t fbSolidFillvmx (FbPixels *pDraw, + int x, + int y, + int width, + int height, + FbBits xor); +#endif +#endif /* USE_VMX */ -- 2.7.4
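[Editor's note on the arithmetic used throughout pixman-vmx.c: pix_multiply, like the FbByteMul*/FbByteMulC* helpers used in the scalar tails for the trailing width%4 pixels, multiplies each 8-bit channel by an 8-bit alpha and divides by 255 with the usual bias-and-fold trick rather than a real division. The vector version does it per 16-bit lane with vec_mladd against a 0x0080 bias, a vec_adds of the value shifted right by 8, and a final 8-bit shift. A plain-C reference of the per-channel step follows; the function name is illustrative only and assumes nothing beyond stdint.h.]

    #include <stdint.h>

    /* Illustrative only: x*a/255 rounded to nearest, computed the way
     * pix_multiply does it per lane (bias by 0x80, fold the high byte
     * back in, then shift right by 8). */
    static uint8_t
    mul_div_255 (uint8_t x, uint8_t a)
    {
        uint32_t t = (uint32_t) x * a + 0x80;   /* bias so the truncation below rounds */
        t += t >> 8;                            /* fold: makes the /256 behave like /255 */
        return (uint8_t) (t >> 8);
    }

[Because the biased product of two 8-bit values never exceeds 16 bits even after the fold, the saturating vec_adds in pix_multiply never actually saturates, so each lane matches this scalar formula exactly before the final vec_packsu.]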