From 8cc398f2ad934765c14e78c4ecc1f6860937b296 Mon Sep 17 00:00:00 2001 From: lucas Date: Fri, 30 Sep 2011 02:35:45 +0000 Subject: [PATCH] evas: add common SSE3 blending functions/macro mul_256_sse3 sub4_alpha_sse3 interp4_256_sse3 mul_sym_sse3 mul4_sym_sse3 mul3_sym_sse3 LOOP_ALIGNED_U1_A48_SSE3 __attribute__((always_inline)) is needed to coax GCC (< 4.6.0) into inlining the common blend ops. Not inlining these functions causes a steep performance penalty. Patch by: Jim Kukunas git-svn-id: http://svn.enlightenment.org/svn/e/trunk/evas@63698 7cbeb6ba-43b4-40fd-8cce-4c39aea84d33 --- configure.ac | 1 + m4/efl_attribute.m4 | 24 +++++ src/lib/include/evas_blend_ops.h | 191 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 216 insertions(+) diff --git a/configure.ac b/configure.ac index 131c383..0ad2ce9 100644 --- a/configure.ac +++ b/configure.ac @@ -440,6 +440,7 @@ AC_C_BIGENDIAN AM_PROG_CC_STDC EFL_ATTRIBUTE_UNUSED EFL_ATTRIBUTE_VECTOR +EFL_ATTRIBUTE_ALWAYS_INLINE WIN32_CPPFLAGS="" case "$host_os" in diff --git a/m4/efl_attribute.m4 b/m4/efl_attribute.m4 index d03db53..e814e36 100644 --- a/m4/efl_attribute.m4 +++ b/m4/efl_attribute.m4 @@ -51,4 +51,28 @@ if test "x${have_attribute_vector}" = "xyes" ; then fi ]) +dnl Usage: EFL_ATTRIBUTE_ALWAYS_INLINE +dnl call AC_DEFINE for always_inline if __attribute__((always_inline)) is available + +AC_DEFUN([EFL_ATTRIBUTE_ALWAYS_INLINE], +[ +AC_MSG_CHECKING([for __attribute__ ((always_inline))]) +AC_COMPILE_IFELSE( + [AC_LANG_PROGRAM( + [[ + ]], + [[ +__attribute__((always_inline)) void foo(void) {} + ]])], + [have_attribute_always_inline="yes"], + [have_attribute_always_inline="no"]) +AC_MSG_RESULT([${have_attribute_always_inline}]) + +if test "x${have_attribute_always_inline}" = "xyes" ; then + AC_DEFINE([always_inline], [__attribute__ ((always_inline)) inline], [Macro declaring a function to always be inlined.]) +else + AC_DEFINE([always_inline], [inline], [Macro declaring a function to always be inlined.]) +fi +]) + dnl 
End of efl_attribute.m4 diff --git a/src/lib/include/evas_blend_ops.h b/src/lib/include/evas_blend_ops.h index 9647800..0c27316 100644 --- a/src/lib/include/evas_blend_ops.h +++ b/src/lib/include/evas_blend_ops.h @@ -5,6 +5,10 @@ #include "evas_mmx.h" #endif +#if defined BUILD_SSE3 +#include <immintrin.h> +#endif + /* src pixel flags: */ /* pixels none */ @@ -178,4 +182,191 @@ extern const DATA32 ALPHA_256; #endif + +/* some useful SSE3 inline functions */ + +#ifdef BUILD_SSE3 + +static __m128i GA_MASK_SSE3; +static __m128i RB_MASK_SSE3; +static __m128i SYM4_MASK_SSE3; +static __m128i RGB_MASK_SSE3; +static __m128i A_MASK_SSE3; + +static __m128i ALPHA_SSE3; + +static always_inline __m128i +mul_256_sse3(__m128i a, __m128i c) { + + /* prepare alpha for word multiplication */ + __m128i a_l = a; + __m128i a_h = a; + a_l = _mm_unpacklo_epi16(a_l, a_l); + a_h = _mm_unpackhi_epi16(a_h, a_h); + __m128i a0 = (__m128i) _mm_shuffle_ps( (__m128)a_l, (__m128)a_h, 0x88); + + /* first half of calc */ + __m128i c0 = c; + c0 = _mm_srli_epi32(c0, 8); + c0 = _mm_and_si128(GA_MASK_SSE3, c0); + c0 = _mm_mullo_epi16(a0, c0); + c0 = _mm_and_si128(RB_MASK_SSE3, c0); + + /* second half of calc */ + __m128i c1 = c; + c1 = _mm_and_si128(GA_MASK_SSE3, c1); + c1 = _mm_mullo_epi16(a0, c1); + c1 = _mm_srli_epi32(c1, 8); + c1 = _mm_and_si128(GA_MASK_SSE3, c1); + + /* combine */ + return _mm_add_epi32(c0, c1); +} + +static always_inline __m128i +sub4_alpha_sse3(__m128i c) { + + __m128i c0 = c; + + c0 = _mm_srli_epi32(c0, 24); + return _mm_sub_epi32(ALPHA_SSE3, c0); +} + +static always_inline __m128i +interp4_256_sse3(__m128i a, __m128i c0, __m128i c1) +{ + const __m128i zero = _mm_setzero_si128(); + + __m128i a_l = a; + __m128i a_h = a; + a_l = _mm_unpacklo_epi16(a_l, a_l); + a_h = _mm_unpackhi_epi16(a_h, a_h); + + __m128i a_t = _mm_slli_epi64(a_l, 32); + __m128i a_t0 = _mm_slli_epi64(a_h, 32); + + a_l = _mm_add_epi32(a_l, a_t); + a_h = _mm_add_epi32(a_h, a_t0); + + __m128i c0_l = c0; + __m128i c0_h = c0; + 
c0_l = _mm_unpacklo_epi8(c0_l, zero); + c0_h = _mm_unpackhi_epi8(c0_h, zero); + + __m128i c1_l = c1; + __m128i c1_h = c1; + + c1_l = _mm_unpacklo_epi8(c1_l, zero); + c1_h = _mm_unpackhi_epi8(c1_h, zero); + + __m128i cl_sub = _mm_sub_epi16(c0_l, c1_l); + __m128i ch_sub = _mm_sub_epi16(c0_h, c1_h); + + cl_sub = _mm_mullo_epi16(cl_sub, a_l); + ch_sub = _mm_mullo_epi16(ch_sub, a_h); + + __m128i c1ls = _mm_slli_epi16(c1_l, 8); + __m128i c1hs = _mm_slli_epi16(c1_h, 8); + + cl_sub = _mm_add_epi16(cl_sub, c1ls); + ch_sub = _mm_add_epi16(ch_sub, c1hs); + + cl_sub = _mm_and_si128(cl_sub, RB_MASK_SSE3); + ch_sub = _mm_and_si128(ch_sub, RB_MASK_SSE3); + + cl_sub = _mm_srli_epi64(cl_sub, 8); + ch_sub = _mm_srli_epi64(ch_sub, 8); + + cl_sub = _mm_packus_epi16(cl_sub, cl_sub); + ch_sub = _mm_packus_epi16(ch_sub, ch_sub); + + return (__m128i) _mm_shuffle_ps( (__m128)cl_sub, (__m128)ch_sub, 0x44); +} + +static always_inline __m128i +mul_sym_sse3(__m128i a, __m128i c) { + + /* Prepare alpha for word mult */ + __m128i a_l = a; + __m128i a_h = a; + a_l = _mm_unpacklo_epi16(a_l, a_l); + a_h = _mm_unpackhi_epi16(a_h, a_h); + __m128i a0 = (__m128i) _mm_shuffle_ps( (__m128)a_l, (__m128)a_h, 0x88); + + /* first part */ + __m128i c0 = c; + c0 = _mm_srli_epi32(c0, 8); + c0 = _mm_and_si128(GA_MASK_SSE3, c0); + c0 = _mm_mullo_epi16(a0, c0); + c0 = _mm_add_epi32(c0, GA_MASK_SSE3); + c0 = _mm_and_si128(RB_MASK_SSE3, c0); + + /* second part */ + __m128i c1 = c; + c1 = _mm_and_si128(GA_MASK_SSE3, c1); + c1 = _mm_mullo_epi16(a0, c1); + c1 = _mm_add_epi32(c1, GA_MASK_SSE3); + c1 = _mm_srli_epi32(c1, 8); + c1 = _mm_and_si128(GA_MASK_SSE3, c1); + + return _mm_add_epi32(c0, c1); +} + +static always_inline __m128i +mul4_sym_sse3(__m128i x, __m128i y) { + + const __m128i zero = _mm_setzero_si128(); + + __m128i x_l = _mm_unpacklo_epi8(x, zero); + __m128i x_h = _mm_unpackhi_epi8(x, zero); + + __m128i y_l = _mm_unpacklo_epi8(y, zero); + __m128i y_h = _mm_unpackhi_epi8(y, zero); + + __m128i r_l = 
_mm_mullo_epi16(x_l, y_l); + __m128i r_h = _mm_mullo_epi16(x_h, y_h); + + r_l = _mm_add_epi16(r_l, SYM4_MASK_SSE3); + r_h = _mm_add_epi16(r_h, SYM4_MASK_SSE3); + + r_l = _mm_srli_epi16(r_l, 8); + r_h = _mm_srli_epi16(r_h, 8); + + return _mm_packus_epi16(r_l, r_h); +} + +static always_inline __m128i +mul3_sym_sse3(__m128i x, __m128i y) { + + __m128i res = mul4_sym_sse3(x, y); + return _mm_and_si128(res, RGB_MASK_SSE3); +} + +#define LOOP_ALIGNED_U1_A48_SSE3(D, LENGTH, UOP,A4OP, A8OP) \ + { \ + while((uintptr_t)d & 0xF && l) UOP \ + \ + while(l) { \ + switch(l) { \ + case 3: UOP \ + case 2: UOP \ + case 1: UOP \ + break; \ + case 7: \ + case 6: \ + case 5: \ + case 4: \ + A4OP \ + break; \ + default: \ + A8OP \ + break; \ + } \ + } \ + } + + +#endif + #endif -- 2.7.4