From c78e986085b3993f1b4355151820228c53d54cad Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Fri, 24 Feb 2012 12:43:43 -0500
Subject: [PATCH] mmx: compile on MIPS for Loongson MMI optimizations

                             image               image16
           evolution   32.985 ->  29.667    27.314 ->  23.870
firefox-planet-gnome  197.982 -> 180.437   220.986 -> 205.057
gnome-system-monitor   48.482 ->  49.752    52.820 ->  49.528
  gnome-terminal-vim   60.799 ->  50.528    51.655 ->  44.131
      grads-heat-map    3.167 ->   3.181     3.328 ->   3.321
                gvim   38.646 ->  32.552    38.126 ->  34.453
       midori-zoomed   44.371 ->  43.338    28.860 ->  28.865
           ocitysmap   23.065 ->  18.057    23.046 ->  18.055
             poppler   43.676 ->  36.077    43.065 ->  36.090
  swfdec-giant-steps   20.166 ->  20.365    22.354 ->  16.578
      swfdec-youtube   31.502 ->  28.118    44.052 ->  41.771
   xfce4-terminal-a1   69.517 ->  51.288    62.225 ->  53.309
---
 configure.ac               |  54 +++++++++++
 pixman/Makefile.am         |  12 +++
 pixman/loongson-mmintrin.h | 218 +++++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-cpu.c        |  37 ++++++--
 pixman/pixman-mmx.c        |  40 ++++++++-
 pixman/pixman-private.h    |   2 +-
 6 files changed, 350 insertions(+), 13 deletions(-)
 create mode 100644 pixman/loongson-mmintrin.h

diff --git a/configure.ac b/configure.ac
index b8d59f6..5478734 100644
--- a/configure.ac
+++ b/configure.ac
@@ -271,6 +271,59 @@ PIXMAN_CHECK_CFLAG([-xldscope=hidden], [dnl
 ])
 
 dnl ===========================================================================
+dnl Check for Loongson Multimedia Instructions
+
+if test "x$LS_CFLAGS" = "x" ; then
+    LS_CFLAGS="-march=loongson2f"
+fi
+
+have_loongson_mmi=no
+AC_MSG_CHECKING(whether to use Loongson MMI)
+
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS=" $CFLAGS $LS_CFLAGS"
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+#ifndef __mips_loongson_vector_rev
+#error "Loongson Multimedia Instructions are only available on Loongson"
+#endif
+#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 4))
+#error "Need GCC >= 4.4 for Loongson MMI compilation"
+#endif
+#include "pixman/loongson-mmintrin.h"
+int main () {
+    union {
+        __m64 v;
+        char c[8];
+    } a = { .c = {1, 2, 3, 4, 5, 6, 7, 8} };
+    int b = 4;
+    __m64 c = _mm_srli_pi16 (a.v, b);
+    return 0;
+}]])], have_loongson_mmi=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(loongson,
+   [AC_HELP_STRING([--disable-loongson],
+                   [disable Loongson fast paths])],
+   [enable_loongson=$enableval], [enable_loongson=auto])
+
+if test $enable_loongson = no ; then
+   have_loongson_mmi=disabled
+fi
+
+if test $have_loongson_mmi = yes ; then
+   AC_DEFINE(USE_LOONGSON_MMI, 1, [use Loongson Multimedia Instructions])
+else
+   LS_CFLAGS=
+fi
+
+AC_MSG_RESULT($have_loongson_mmi)
+if test $enable_loongson = yes && test $have_loongson_mmi = no ; then
+   AC_MSG_ERROR([Loongson MMI not detected])
+fi
+
+AM_CONDITIONAL(USE_LOONGSON_MMI, test $have_loongson_mmi = yes)
+
+dnl ===========================================================================
 dnl Check for MMX
 
 if test "x$MMX_CFLAGS" = "x" ; then
@@ -416,6 +469,7 @@ case $host_os in
       ;;
 esac
 
+AC_SUBST(LS_CFLAGS)
 AC_SUBST(IWMMXT_CFLAGS)
 AC_SUBST(MMX_CFLAGS)
 AC_SUBST(MMX_LDFLAGS)
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index fb7e047..b320a58 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -118,5 +118,17 @@ libpixman_1_la_LIBADD += libpixman-mips-dspr2.la
 ASM_CFLAGS_mips_dspr2=
 endif
 
+# loongson code
+if USE_LOONGSON_MMI
+noinst_LTLIBRARIES += libpixman-loongson-mmi.la
+libpixman_loongson_mmi_la_SOURCES = pixman-mmx.c loongson-mmintrin.h
+libpixman_loongson_mmi_la_CFLAGS = $(DEP_CFLAGS) $(LS_CFLAGS)
+libpixman_loongson_mmi_la_LIBADD = $(DEP_LIBS)
+libpixman_1_la_LDFLAGS += $(LS_LDFLAGS)
+libpixman_1_la_LIBADD += libpixman-loongson-mmi.la
+
+ASM_CFLAGS_ls=$(LS_CFLAGS)
+endif
+
 .c.s : $(libpixmaninclude_HEADERS) $(BUILT_SOURCES)
 	$(CC) $(CFLAGS) $(ASM_CFLAGS_$(@:pixman-%.s=%)) $(ASM_CFLAGS_$(@:pixman-arm-%.s=arm_%)) -DHAVE_CONFIG_H -I$(srcdir) -I$(builddir) -I$(top_builddir) -S -o $@ $<
diff --git a/pixman/loongson-mmintrin.h b/pixman/loongson-mmintrin.h
new file mode 100644
index 0000000..44d30f5
--- /dev/null
+++ b/pixman/loongson-mmintrin.h
@@ -0,0 +1,218 @@
+/* The gcc-provided loongson intrinsic functions are way too fucking broken
+ * to be of any use, otherwise I'd use them.
+ *
+ * - The hardware instructions are very similar to MMX or iwMMXt. Certainly
+ *   close enough that they could have implemented the _mm_*-style intrinsic
+ *   interface and had a ton of optimized code available to them. Instead they
+ *   implemented something much, much worse.
+ *
+ * - pshuf takes a dead first argument, causing extra instructions to be
+ *   generated.
+ *
+ * - There are no 64-bit shift or logical intrinsics, which means you have
+ *   to implement them with inline assembly, but this is a nightmare because
+ *   gcc doesn't understand that the integer vector datatypes are actually in
+ *   floating-point registers, so you end up with braindead code like
+ *
+ *	punpcklwd	$f9,$f9,$f5
+ *	    dmtc1	v0,$f8
+ *	punpcklwd	$f19,$f19,$f5
+ *	    dmfc1	t9,$f9
+ *	    dmtc1	v0,$f9
+ *	    dmtc1	t9,$f20
+ *	    dmfc1	s0,$f19
+ *	punpcklbh	$f20,$f20,$f2
+ *
+ *   where crap just gets copied back and forth between integer and floating-
+ *   point registers ad nauseum.
+ *
+ * Instead of trying to workaround the problems from these crap intrinsics, I
+ * just implement the _mm_* intrinsics needed for pixman-mmx.c using inline
+ * assembly.
+ */
+
+#include <stdint.h>
+
+/* vectors are stored in 64-bit floating-point registers */
+typedef double __m64;
+/* having a 32-bit datatype allows us to use 32-bit loads in places like load8888 */
+typedef float  __m32;
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setzero_si64 (void)
+{
+	return 0.0;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_pu16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("paddush %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_pu8 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("paddusb %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_and_si64 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("and %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_empty (void)
+{
+
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhi_pu16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("pmulhuh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("pmullh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_or_si64 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("or %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_pu16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("packushb %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_pi16 (__m64 __m, int64_t __n)
+{
+	__m64 ret;
+	asm("pshufh %0, %1, %2\n\t"
+	    : "=f" (ret)
+	    : "f" (__m), "f" (*(__m64 *)&__n)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_si64 (__m64 __m, int64_t __count)
+{
+	__m64 ret;
+	asm("dsll  %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m), "f" (*(__m64 *)&__count)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_pi16 (__m64 __m, int64_t __count)
+{
+	__m64 ret;
+	asm("psrlh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m), "f" (*(__m64 *)&__count)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_si64 (__m64 __m, int64_t __count)
+{
+	__m64 ret;
+	asm("dsrl  %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m), "f" (*(__m64 *)&__count)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("punpckhbh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("punpcklbh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+/* Since punpcklbh doesn't care about the high 32-bits, we use the __m32 datatype which
+ * allows load8888 to use 32-bit loads */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_pi8_f (__m32 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("punpcklbh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_xor_si64 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("xor %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
diff --git a/pixman/pixman-cpu.c b/pixman/pixman-cpu.c
index 1060f47..aa9036f 100644
--- a/pixman/pixman-cpu.c
+++ b/pixman/pixman-cpu.c
@@ -427,22 +427,19 @@ pixman_have_arm_iwmmxt (void)
 
 #endif /* USE_ARM_SIMD || USE_ARM_NEON || USE_ARM_IWMMXT */
 
-#if defined(USE_MIPS_DSPR2)
+#if defined(USE_MIPS_DSPR2) || defined(USE_LOONGSON_MMI)
 
 #if defined (__linux__) /* linux ELF */
 
-pixman_bool_t
-pixman_have_mips_dspr2 (void)
+static pixman_bool_t
+pixman_have_mips_feature (const char *search_string)
 {
-    const char *search_string = "MIPS 74K";
     const char *file_name = "/proc/cpuinfo";
-    /* Simple detection of MIPS DSP ASE (revision 2) at runtime for Linux.
+    /* Simple detection of MIPS features at runtime for Linux.
      * It is based on /proc/cpuinfo, which reveals hardware configuration
      * to user-space applications.  According to MIPS (early 2010), no similar
      * facility is universally available on the MIPS architectures, so it's up
      * to individual OSes to provide such.
-     *
-     * Only currently available MIPS core that supports DSPr2 is 74K.
      */
 
     char cpuinfo_line[256];
@@ -467,13 +464,32 @@ pixman_have_mips_dspr2 (void)
     return FALSE;
 }
 
+#if defined(USE_MIPS_DSPR2)
+pixman_bool_t
+pixman_have_mips_dspr2 (void)
+{
+     /* Only currently available MIPS core that supports DSPr2 is 74K. */
+    return pixman_have_mips_feature ("MIPS 74K");
+}
+#endif
+
+#if defined(USE_LOONGSON_MMI)
+pixman_bool_t
+pixman_have_loongson_mmi (void)
+{
+    /* I really don't know if some Loongson CPUs don't have MMI. */
+    return pixman_have_mips_feature ("Loongson");
+}
+#endif
+
 #else /* linux ELF */
 
 #define pixman_have_mips_dspr2() FALSE
+#define pixman_have_loongson_mmi() FALSE
 
 #endif /* linux ELF */
 
-#endif /* USE_MIPS_DSPR2 */
+#endif /* USE_MIPS_DSPR2 || USE_LOONGSON_MMI */
 
 #if defined(USE_X86_MMX) || defined(USE_SSE2)
 /* The CPU detection code needs to be in a file not compiled with
@@ -773,7 +789,10 @@ _pixman_choose_implementation (void)
     if (!disabled ("arm-iwmmxt") && pixman_have_arm_iwmmxt ())
 	imp = _pixman_implementation_create_mmx (imp);
 #endif
-
+#ifdef USE_LOONGSON_MMI
+    if (!disabled ("loongson-mmi") && pixman_have_loongson_mmi ())
+	imp = _pixman_implementation_create_mmx (imp);
+#endif
 #ifdef USE_ARM_NEON
     if (!disabled ("arm-neon") && pixman_have_arm_neon ())
 	imp = _pixman_implementation_create_arm_neon (imp);
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index fe3d42d..86307b0 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -33,9 +33,13 @@
 #include <config.h>
 #endif
 
-#if defined USE_X86_MMX || defined USE_ARM_IWMMXT
+#if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI
 
+#ifdef USE_LOONGSON_MMI
+#include <loongson-mmintrin.h>
+#else
 #include <mmintrin.h>
+#endif
 #include "pixman-private.h"
 #include "pixman-combine32.h"
 
@@ -125,11 +129,14 @@ _mm_shuffle_pi16 (__m64 __A, int8_t const __N)
  * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
  * If __m64 and uint64_t values can just be cast to each other directly,
  * then define USE_M64_CASTS.
+ * If __m64 is a double datatype, then define USE_M64_DOUBLE.
  */
 #ifdef _MSC_VER
 # define M64_MEMBER m64_u64
 #elif defined(__ICC)
 # define USE_CVT_INTRINSICS
+#elif defined(USE_LOONGSON_MMI)
+# define USE_M64_DOUBLE
 #elif defined(__GNUC__)
 # define USE_M64_CASTS
 #elif defined(__SUNPRO_C)
@@ -147,7 +154,7 @@ _mm_shuffle_pi16 (__m64 __A, int8_t const __N)
 # endif
 #endif
 
-#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS)
+#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
 typedef uint64_t mmxdatafield;
 #else
 typedef __m64 mmxdatafield;
@@ -199,6 +206,8 @@ static const mmx_data_t c =
 #    define MC(x) to_m64 (c.mmx_ ## x)
 #elif defined(USE_M64_CASTS)
 #    define MC(x) ((__m64)c.mmx_ ## x)
+#elif defined(USE_M64_DOUBLE)
+#    define MC(x) (*(__m64 *)&c.mmx_ ## x)
 #else
 #    define MC(x) c.mmx_ ## x
 #endif
@@ -213,6 +222,8 @@ to_m64 (uint64_t x)
 
     res.M64_MEMBER = x;
     return res;
+#elif defined USE_M64_DOUBLE
+    return *(__m64 *)&x;
 #else /* USE_M64_CASTS */
     return (__m64)x;
 #endif
@@ -226,6 +237,8 @@ to_uint64 (__m64 x)
 #elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
     uint64_t res = x.M64_MEMBER;
     return res;
+#elif defined USE_M64_DOUBLE
+    return *(uint64_t *)&x;
 #else /* USE_M64_CASTS */
     return (uint64_t)x;
 #endif
@@ -358,13 +371,26 @@ static force_inline uint32_t ldl_u(const uint32_t *p)
 static force_inline __m64
 load (const uint32_t *v)
 {
+#ifdef USE_LOONGSON_MMI
+    __m64 ret;
+    asm ("lwc1 %0, %1\n\t"
+	: "=f" (ret)
+	: "m" (*v)
+    );
+    return ret;
+#else
     return _mm_cvtsi32_si64 (*v);
+#endif
 }
 
 static force_inline __m64
 load8888 (const uint32_t *v)
 {
+#ifdef USE_LOONGSON_MMI
+    return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
+#else
     return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
+#endif
 }
 
 static force_inline __m64
@@ -383,7 +409,15 @@ pack8888 (__m64 lo, __m64 hi)
 static force_inline void
 store (uint32_t *dest, __m64 v)
 {
+#ifdef USE_LOONGSON_MMI
+    asm ("swc1 %1, %0\n\t"
+	: "=m" (*dest)
+	: "f" (v)
+	: "memory"
+    );
+#else
     *dest = _mm_cvtsi64_si32 (v);
+#endif
 }
 
 static force_inline void
@@ -3275,4 +3309,4 @@ _pixman_implementation_create_mmx (pixman_implementation_t *fallback)
     return imp;
 }
 
-#endif /* USE_X86_MMX || USE_ARM_IWMMXT */
+#endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index 0cba2e9..f456bbd 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -539,7 +539,7 @@ _pixman_implementation_create_fast_path (pixman_implementation_t *fallback);
 pixman_implementation_t *
 _pixman_implementation_create_noop (pixman_implementation_t *fallback);
 
-#if defined USE_X86_MMX || defined USE_ARM_IWMMXT
+#if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI
 pixman_implementation_t *
 _pixman_implementation_create_mmx (pixman_implementation_t *fallback);
 #endif
-- 
2.7.4