From c78e986085b3993f1b4355151820228c53d54cad Mon Sep 17 00:00:00 2001 From: Matt Turner Date: Fri, 24 Feb 2012 12:43:43 -0500 Subject: [PATCH] mmx: compile on MIPS for Loongson MMI optimizations image image16 evolution 32.985 -> 29.667 27.314 -> 23.870 firefox-planet-gnome 197.982 -> 180.437 220.986 -> 205.057 gnome-system-monitor 48.482 -> 49.752 52.820 -> 49.528 gnome-terminal-vim 60.799 -> 50.528 51.655 -> 44.131 grads-heat-map 3.167 -> 3.181 3.328 -> 3.321 gvim 38.646 -> 32.552 38.126 -> 34.453 midori-zoomed 44.371 -> 43.338 28.860 -> 28.865 ocitysmap 23.065 -> 18.057 23.046 -> 18.055 poppler 43.676 -> 36.077 43.065 -> 36.090 swfdec-giant-steps 20.166 -> 20.365 22.354 -> 16.578 swfdec-youtube 31.502 -> 28.118 44.052 -> 41.771 xfce4-terminal-a1 69.517 -> 51.288 62.225 -> 53.309 --- configure.ac | 54 +++++++++++ pixman/Makefile.am | 12 +++ pixman/loongson-mmintrin.h | 218 +++++++++++++++++++++++++++++++++++++++++++++ pixman/pixman-cpu.c | 37 ++++++-- pixman/pixman-mmx.c | 40 ++++++++- pixman/pixman-private.h | 2 +- 6 files changed, 350 insertions(+), 13 deletions(-) create mode 100644 pixman/loongson-mmintrin.h diff --git a/configure.ac b/configure.ac index b8d59f6..5478734 100644 --- a/configure.ac +++ b/configure.ac @@ -271,6 +271,59 @@ PIXMAN_CHECK_CFLAG([-xldscope=hidden], [dnl ]) dnl =========================================================================== +dnl Check for Loongson Multimedia Instructions + +if test "x$LS_CFLAGS" = "x" ; then + LS_CFLAGS="-march=loongson2f" +fi + +have_loongson_mmi=no +AC_MSG_CHECKING(whether to use Loongson MMI) + +xserver_save_CFLAGS=$CFLAGS +CFLAGS=" $CFLAGS $LS_CFLAGS" +AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ +#ifndef __mips_loongson_vector_rev +#error "Loongson Multimedia Instructions are only available on Loongson" +#endif +#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 4)) +#error "Need GCC >= 4.4 for Loongson MMI compilation" +#endif +#include "pixman/loongson-mmintrin.h" +int main () { + 
union { + __m64 v; + char c[8]; + } a = { .c = {1, 2, 3, 4, 5, 6, 7, 8} }; + int b = 4; + __m64 c = _mm_srli_pi16 (a.v, b); + return 0; +}]])], have_loongson_mmi=yes) +CFLAGS=$xserver_save_CFLAGS + +AC_ARG_ENABLE(loongson, + [AC_HELP_STRING([--disable-loongson], + [disable Loongson fast paths])], + [enable_loongson=$enableval], [enable_loongson=auto]) + +if test $enable_loongson = no ; then + have_loongson_mmi=disabled +fi + +if test $have_loongson_mmi = yes ; then + AC_DEFINE(USE_LOONGSON_MMI, 1, [use Loongson Multimedia Instructions]) +else + LS_CFLAGS= +fi + +AC_MSG_RESULT($have_loongson_mmi) +if test $enable_loongson = yes && test $have_loongson_mmi = no ; then + AC_MSG_ERROR([Loongson MMI not detected]) +fi + +AM_CONDITIONAL(USE_LOONGSON_MMI, test $have_loongson_mmi = yes) + +dnl =========================================================================== dnl Check for MMX if test "x$MMX_CFLAGS" = "x" ; then @@ -416,6 +469,7 @@ case $host_os in ;; esac +AC_SUBST(LS_CFLAGS) AC_SUBST(IWMMXT_CFLAGS) AC_SUBST(MMX_CFLAGS) AC_SUBST(MMX_LDFLAGS) diff --git a/pixman/Makefile.am b/pixman/Makefile.am index fb7e047..b320a58 100644 --- a/pixman/Makefile.am +++ b/pixman/Makefile.am @@ -118,5 +118,17 @@ libpixman_1_la_LIBADD += libpixman-mips-dspr2.la ASM_CFLAGS_mips_dspr2= endif +# loongson code +if USE_LOONGSON_MMI +noinst_LTLIBRARIES += libpixman-loongson-mmi.la +libpixman_loongson_mmi_la_SOURCES = pixman-mmx.c loongson-mmintrin.h +libpixman_loongson_mmi_la_CFLAGS = $(DEP_CFLAGS) $(LS_CFLAGS) +libpixman_loongson_mmi_la_LIBADD = $(DEP_LIBS) +libpixman_1_la_LDFLAGS += $(LS_LDFLAGS) +libpixman_1_la_LIBADD += libpixman-loongson-mmi.la + +ASM_CFLAGS_ls=$(LS_CFLAGS) +endif + .c.s : $(libpixmaninclude_HEADERS) $(BUILT_SOURCES) $(CC) $(CFLAGS) $(ASM_CFLAGS_$(@:pixman-%.s=%)) $(ASM_CFLAGS_$(@:pixman-arm-%.s=arm_%)) -DHAVE_CONFIG_H -I$(srcdir) -I$(builddir) -I$(top_builddir) -S -o $@ $< diff --git a/pixman/loongson-mmintrin.h b/pixman/loongson-mmintrin.h new file mode 100644 index 
0000000..44d30f5 --- /dev/null +++ b/pixman/loongson-mmintrin.h @@ -0,0 +1,218 @@ +/* The gcc-provided loongson intrinsic functions are way too fucking broken + * to be of any use, otherwise I'd use them. + * + * - The hardware instructions are very similar to MMX or iwMMXt. Certainly + * close enough that they could have implemented the _mm_*-style intrinsic + * interface and had a ton of optimized code available to them. Instead they + * implemented something much, much worse. + * + * - pshuf takes a dead first argument, causing extra instructions to be + * generated. + * + * - There are no 64-bit shift or logical intrinsics, which means you have + * to implement them with inline assembly, but this is a nightmare because + * gcc doesn't understand that the integer vector datatypes are actually in + * floating-point registers, so you end up with braindead code like + * + * punpcklwd $f9,$f9,$f5 + * dmtc1 v0,$f8 + * punpcklwd $f19,$f19,$f5 + * dmfc1 t9,$f9 + * dmtc1 v0,$f9 + * dmtc1 t9,$f20 + * dmfc1 s0,$f19 + * punpcklbh $f20,$f20,$f2 + * + * where crap just gets copied back and forth between integer and floating- + * point registers ad nauseam. + * + * Instead of trying to work around the problems from these crap intrinsics, I + * just implement the _mm_* intrinsics needed for pixman-mmx.c using inline + * assembly.
+ */ + +#include + +/* vectors are stored in 64-bit floating-point registers */ +typedef double __m64; +/* having a 32-bit datatype allows us to use 32-bit loads in places like load8888 */ +typedef float __m32; + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setzero_si64 (void) +{ + return 0.0; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_pu16 (__m64 __m1, __m64 __m2) +{ + __m64 ret; + asm("paddush %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_pu8 (__m64 __m1, __m64 __m2) +{ + __m64 ret; + asm("paddusb %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_and_si64 (__m64 __m1, __m64 __m2) +{ + __m64 ret; + asm("and %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + return ret; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_empty (void) +{ + +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mulhi_pu16 (__m64 __m1, __m64 __m2) +{ + __m64 ret; + asm("pmulhuh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mullo_pi16 (__m64 __m1, __m64 __m2) +{ + __m64 ret; + asm("pmullh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_or_si64 (__m64 __m1, __m64 __m2) +{ + __m64 ret; + asm("or %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm_packs_pu16 (__m64 __m1, __m64 __m2) +{ + __m64 ret; + asm("packushb %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_pi16 (__m64 __m, int64_t __n) +{ + __m64 ret; + asm("pshufh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m), "f" (*(__m64 *)&__n) + ); + return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_si64 (__m64 __m, int64_t __count) +{ + __m64 ret; + asm("dsll %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m), "f" (*(__m64 *)&__count) + ); + return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_pi16 (__m64 __m, int64_t __count) +{ + __m64 ret; + asm("psrlh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m), "f" (*(__m64 *)&__count) + ); + return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_si64 (__m64 __m, int64_t __count) +{ + __m64 ret; + asm("dsrl %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m), "f" (*(__m64 *)&__count) + ); + return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2) +{ + __m64 ret; + asm("punpckhbh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2) +{ + __m64 ret; + asm("punpcklbh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + return ret; +} + +/* Since punpcklbh doesn't care about the high 32-bits, we use the __m32 datatype which + * allows load8888 to use 32-bit loads */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_pi8_f (__m32 __m1, __m64 __m2) +{ + __m64 ret; + asm("punpcklbh %0, 
%1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + return ret; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_xor_si64 (__m64 __m1, __m64 __m2) +{ + __m64 ret; + asm("xor %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + return ret; +} diff --git a/pixman/pixman-cpu.c b/pixman/pixman-cpu.c index 1060f47..aa9036f 100644 --- a/pixman/pixman-cpu.c +++ b/pixman/pixman-cpu.c @@ -427,22 +427,19 @@ pixman_have_arm_iwmmxt (void) #endif /* USE_ARM_SIMD || USE_ARM_NEON || USE_ARM_IWMMXT */ -#if defined(USE_MIPS_DSPR2) +#if defined(USE_MIPS_DSPR2) || defined(USE_LOONGSON_MMI) #if defined (__linux__) /* linux ELF */ -pixman_bool_t -pixman_have_mips_dspr2 (void) +static pixman_bool_t +pixman_have_mips_feature (const char *search_string) { - const char *search_string = "MIPS 74K"; const char *file_name = "/proc/cpuinfo"; - /* Simple detection of MIPS DSP ASE (revision 2) at runtime for Linux. + /* Simple detection of MIPS features at runtime for Linux. * It is based on /proc/cpuinfo, which reveals hardware configuration * to user-space applications. According to MIPS (early 2010), no similar * facility is universally available on the MIPS architectures, so it's up * to individual OSes to provide such. - * - * Only currently available MIPS core that supports DSPr2 is 74K. */ char cpuinfo_line[256]; @@ -467,13 +464,32 @@ pixman_have_mips_dspr2 (void) return FALSE; } +#if defined(USE_MIPS_DSPR2) +pixman_bool_t +pixman_have_mips_dspr2 (void) +{ + /* Only currently available MIPS core that supports DSPr2 is 74K. */ + return pixman_have_mips_feature ("MIPS 74K"); +} +#endif + +#if defined(USE_LOONGSON_MMI) +pixman_bool_t +pixman_have_loongson_mmi (void) +{ + /* I really don't know if some Loongson CPUs don't have MMI. 
*/ + return pixman_have_mips_feature ("Loongson"); +} +#endif + #else /* linux ELF */ #define pixman_have_mips_dspr2() FALSE +#define pixman_have_loongson_mmi() FALSE #endif /* linux ELF */ -#endif /* USE_MIPS_DSPR2 */ +#endif /* USE_MIPS_DSPR2 || USE_LOONGSON_MMI */ #if defined(USE_X86_MMX) || defined(USE_SSE2) /* The CPU detection code needs to be in a file not compiled with @@ -773,7 +789,10 @@ _pixman_choose_implementation (void) if (!disabled ("arm-iwmmxt") && pixman_have_arm_iwmmxt ()) imp = _pixman_implementation_create_mmx (imp); #endif - +#ifdef USE_LOONGSON_MMI + if (!disabled ("loongson-mmi") && pixman_have_loongson_mmi ()) + imp = _pixman_implementation_create_mmx (imp); +#endif #ifdef USE_ARM_NEON if (!disabled ("arm-neon") && pixman_have_arm_neon ()) imp = _pixman_implementation_create_arm_neon (imp); diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c index fe3d42d..86307b0 100644 --- a/pixman/pixman-mmx.c +++ b/pixman/pixman-mmx.c @@ -33,9 +33,13 @@ #include #endif -#if defined USE_X86_MMX || defined USE_ARM_IWMMXT +#if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI +#ifdef USE_LOONGSON_MMI +#include +#else #include +#endif #include "pixman-private.h" #include "pixman-combine32.h" @@ -125,11 +129,14 @@ _mm_shuffle_pi16 (__m64 __A, int8_t const __N) * uint64_t and __m64 values, then define USE_CVT_INTRINSICS. * If __m64 and uint64_t values can just be cast to each other directly, * then define USE_M64_CASTS. + * If __m64 is a double datatype, then define USE_M64_DOUBLE. 
*/ #ifdef _MSC_VER # define M64_MEMBER m64_u64 #elif defined(__ICC) # define USE_CVT_INTRINSICS +#elif defined(USE_LOONGSON_MMI) +# define USE_M64_DOUBLE #elif defined(__GNUC__) # define USE_M64_CASTS #elif defined(__SUNPRO_C) @@ -147,7 +154,7 @@ _mm_shuffle_pi16 (__m64 __A, int8_t const __N) # endif #endif -#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) +#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE) typedef uint64_t mmxdatafield; #else typedef __m64 mmxdatafield; @@ -199,6 +206,8 @@ static const mmx_data_t c = # define MC(x) to_m64 (c.mmx_ ## x) #elif defined(USE_M64_CASTS) # define MC(x) ((__m64)c.mmx_ ## x) +#elif defined(USE_M64_DOUBLE) +# define MC(x) (*(__m64 *)&c.mmx_ ## x) #else # define MC(x) c.mmx_ ## x #endif @@ -213,6 +222,8 @@ to_m64 (uint64_t x) res.M64_MEMBER = x; return res; +#elif defined USE_M64_DOUBLE + return *(__m64 *)&x; #else /* USE_M64_CASTS */ return (__m64)x; #endif @@ -226,6 +237,8 @@ to_uint64 (__m64 x) #elif defined M64_MEMBER /* __m64 is a struct, not an integral type */ uint64_t res = x.M64_MEMBER; return res; +#elif defined USE_M64_DOUBLE + return *(uint64_t *)&x; #else /* USE_M64_CASTS */ return (uint64_t)x; #endif @@ -358,13 +371,26 @@ static force_inline uint32_t ldl_u(const uint32_t *p) static force_inline __m64 load (const uint32_t *v) { +#ifdef USE_LOONGSON_MMI + __m64 ret; + asm ("lwc1 %0, %1\n\t" + : "=f" (ret) + : "m" (*v) + ); + return ret; +#else return _mm_cvtsi32_si64 (*v); +#endif } static force_inline __m64 load8888 (const uint32_t *v) { +#ifdef USE_LOONGSON_MMI + return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ()); +#else return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ()); +#endif } static force_inline __m64 @@ -383,7 +409,15 @@ pack8888 (__m64 lo, __m64 hi) static force_inline void store (uint32_t *dest, __m64 v) { +#ifdef USE_LOONGSON_MMI + asm ("swc1 %1, %0\n\t" + : "=m" (*dest) + : "f" (v) + : "memory" + ); +#else *dest = _mm_cvtsi64_si32 (v); +#endif 
} static force_inline void @@ -3275,4 +3309,4 @@ _pixman_implementation_create_mmx (pixman_implementation_t *fallback) return imp; } -#endif /* USE_X86_MMX || USE_ARM_IWMMXT */ +#endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */ diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h index 0cba2e9..f456bbd 100644 --- a/pixman/pixman-private.h +++ b/pixman/pixman-private.h @@ -539,7 +539,7 @@ _pixman_implementation_create_fast_path (pixman_implementation_t *fallback); pixman_implementation_t * _pixman_implementation_create_noop (pixman_implementation_t *fallback); -#if defined USE_X86_MMX || defined USE_ARM_IWMMXT +#if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI pixman_implementation_t * _pixman_implementation_create_mmx (pixman_implementation_t *fallback); #endif -- 2.7.4