From 8cc398f2ad934765c14e78c4ecc1f6860937b296 Mon Sep 17 00:00:00 2001 From: lucas Date: Fri, 30 Sep 2011 02:35:45 +0000 Subject: [PATCH] evas: add common SSE3 blending functions/macro mul_256_sse3 sub4_alpha_sse3 interp4_256_sse3 mul_sym_sse3 mul4_sym_sse3 mul3_sym_sse3 LOOP_ALIGNED_U1_A48_SSE3 __attribute__((always_inline)) is needed to coax GCC (< 4.6.0) into inlining the common blend ops. Not inlining these functions causes a steep performance penalty. Patch by: Jim Kukunas git-svn-id: http://svn.enlightenment.org/svn/e/trunk/evas@63698 7cbeb6ba-43b4-40fd-8cce-4c39aea84d33 --- configure.ac | 1 + m4/efl_attribute.m4 | 24 +++++ src/lib/include/evas_blend_ops.h | 191 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 216 insertions(+) diff --git a/configure.ac b/configure.ac index 131c383..0ad2ce9 100644 --- a/configure.ac +++ b/configure.ac @@ -440,6 +440,7 @@ AC_C_BIGENDIAN AM_PROG_CC_STDC EFL_ATTRIBUTE_UNUSED EFL_ATTRIBUTE_VECTOR +EFL_ATTRIBUTE_ALWAYS_INLINE WIN32_CPPFLAGS="" case "$host_os" in diff --git a/m4/efl_attribute.m4 b/m4/efl_attribute.m4 index d03db53..e814e36 100644 --- a/m4/efl_attribute.m4 +++ b/m4/efl_attribute.m4 @@ -51,4 +51,28 @@ if test "x${have_attribute_vector}" = "xyes" ; then fi ]) +dnl Usage: EFL_ATTRIBUTE_ALWAYS_INLINE +dnl call AC_DEFINE for always_inline if __attribute__((always_inline)) is available + +AC_DEFUN([EFL_ATTRIBUTE_ALWAYS_INLINE], +[ +AC_MSG_CHECKING([for __attribute__ ((always_inline))]) +AC_COMPILE_IFELSE( + [AC_LANG_PROGRAM( + [[ + ]], + [[ +__attribute__((always_inline)) void foo(void) {} + ]])], + [have_attribute_always_inline="yes"], + [have_attribute_always_inline="no"]) +AC_MSG_RESULT([${have_attribute_always_inline}]) + +if test "x${have_attribute_always_inline}" = "xyes" ; then + AC_DEFINE([always_inline], [__attribute__ ((always_inline)) inline], [Macro declaring a function to always be inlined.]) +else + AC_DEFINE([always_inline], [inline], [Macro declaring a function to always be inlined.]) +fi +]) + dnl 
End of efl_attribute.m4 diff --git a/src/lib/include/evas_blend_ops.h b/src/lib/include/evas_blend_ops.h index 9647800..0c27316 100644 --- a/src/lib/include/evas_blend_ops.h +++ b/src/lib/include/evas_blend_ops.h @@ -5,6 +5,10 @@ #include "evas_mmx.h" #endif +#if defined BUILD_SSE3 +#include <immintrin.h> +#endif + /* src pixel flags: */ /* pixels none */ @@ -178,4 +182,191 @@ extern const DATA32 ALPHA_256; #endif + +/* some useful SSE3 inline functions */ + +#ifdef BUILD_SSE3 + +static __m128i GA_MASK_SSE3; +static __m128i RB_MASK_SSE3; +static __m128i SYM4_MASK_SSE3; +static __m128i RGB_MASK_SSE3; +static __m128i A_MASK_SSE3; + +static __m128i ALPHA_SSE3; + +static always_inline __m128i +mul_256_sse3(__m128i a, __m128i c) { + + /* prepare alpha for word multiplication */ + __m128i a_l = a; + __m128i a_h = a; + a_l = _mm_unpacklo_epi16(a_l, a_l); + a_h = _mm_unpackhi_epi16(a_h, a_h); + __m128i a0 = (__m128i) _mm_shuffle_ps( (__m128)a_l, (__m128)a_h, 0x88); + + /* first half of calc */ + __m128i c0 = c; + c0 = _mm_srli_epi32(c0, 8); + c0 = _mm_and_si128(GA_MASK_SSE3, c0); + c0 = _mm_mullo_epi16(a0, c0); + c0 = _mm_and_si128(RB_MASK_SSE3, c0); + + /* second half of calc */ + __m128i c1 = c; + c1 = _mm_and_si128(GA_MASK_SSE3, c1); + c1 = _mm_mullo_epi16(a0, c1); + c1 = _mm_srli_epi32(c1, 8); + c1 = _mm_and_si128(GA_MASK_SSE3, c1); + + /* combine */ + return _mm_add_epi32(c0, c1); +} + +static always_inline __m128i +sub4_alpha_sse3(__m128i c) { + + __m128i c0 = c; + + c0 = _mm_srli_epi32(c0, 24); + return _mm_sub_epi32(ALPHA_SSE3, c0); +} + +static always_inline __m128i +interp4_256_sse3(__m128i a, __m128i c0, __m128i c1) +{ + const __m128i zero = _mm_setzero_si128(); + + __m128i a_l = a; + __m128i a_h = a; + a_l = _mm_unpacklo_epi16(a_l, a_l); + a_h = _mm_unpackhi_epi16(a_h, a_h); + + __m128i a_t = _mm_slli_epi64(a_l, 32); + __m128i a_t0 = _mm_slli_epi64(a_h, 32); + + a_l = _mm_add_epi32(a_l, a_t); + a_h = _mm_add_epi32(a_h, a_t0); + + __m128i c0_l = c0; + __m128i c0_h = c0; + 
c0_l = _mm_unpacklo_epi8(c0_l, zero); + c0_h = _mm_unpackhi_epi8(c0_h, zero); + + __m128i c1_l = c1; + __m128i c1_h = c1; + + c1_l = _mm_unpacklo_epi8(c1_l, zero); + c1_h = _mm_unpackhi_epi8(c1_h, zero); + + __m128i cl_sub = _mm_sub_epi16(c0_l, c1_l); + __m128i ch_sub = _mm_sub_epi16(c0_h, c1_h); + + cl_sub = _mm_mullo_epi16(cl_sub, a_l); + ch_sub = _mm_mullo_epi16(ch_sub, a_h); + + __m128i c1ls = _mm_slli_epi16(c1_l, 8); + __m128i c1hs = _mm_slli_epi16(c1_h, 8); + + cl_sub = _mm_add_epi16(cl_sub, c1ls); + ch_sub = _mm_add_epi16(ch_sub, c1hs); + + cl_sub = _mm_and_si128(cl_sub, RB_MASK_SSE3); + ch_sub = _mm_and_si128(ch_sub, RB_MASK_SSE3); + + cl_sub = _mm_srli_epi64(cl_sub, 8); + ch_sub = _mm_srli_epi64(ch_sub, 8); + + cl_sub = _mm_packus_epi16(cl_sub, cl_sub); + ch_sub = _mm_packus_epi16(ch_sub, ch_sub); + + return (__m128i) _mm_shuffle_ps( (__m128)cl_sub, (__m128)ch_sub, 0x44); +} + +static always_inline __m128i +mul_sym_sse3(__m128i a, __m128i c) { + + /* Prepare alpha for word mult */ + __m128i a_l = a; + __m128i a_h = a; + a_l = _mm_unpacklo_epi16(a_l, a_l); + a_h = _mm_unpackhi_epi16(a_h, a_h); + __m128i a0 = (__m128i) _mm_shuffle_ps( (__m128)a_l, (__m128)a_h, 0x88); + + /* first part */ + __m128i c0 = c; + c0 = _mm_srli_epi32(c0, 8); + c0 = _mm_and_si128(GA_MASK_SSE3, c0); + c0 = _mm_mullo_epi16(a0, c0); + c0 = _mm_add_epi32(c0, GA_MASK_SSE3); + c0 = _mm_and_si128(RB_MASK_SSE3, c0); + + /* second part */ + __m128i c1 = c; + c1 = _mm_and_si128(GA_MASK_SSE3, c1); + c1 = _mm_mullo_epi16(a0, c1); + c1 = _mm_add_epi32(c1, GA_MASK_SSE3); + c1 = _mm_srli_epi32(c1, 8); + c1 = _mm_and_si128(GA_MASK_SSE3, c1); + + return _mm_add_epi32(c0, c1); +} + +static always_inline __m128i +mul4_sym_sse3(__m128i x, __m128i y) { + + const __m128i zero = _mm_setzero_si128(); + + __m128i x_l = _mm_unpacklo_epi8(x, zero); + __m128i x_h = _mm_unpackhi_epi8(x, zero); + + __m128i y_l = _mm_unpacklo_epi8(y, zero); + __m128i y_h = _mm_unpackhi_epi8(y, zero); + + __m128i r_l = 
_mm_mullo_epi16(x_l, y_l); + __m128i r_h = _mm_mullo_epi16(x_h, y_h); + + r_l = _mm_add_epi16(r_l, SYM4_MASK_SSE3); + r_h = _mm_add_epi16(r_h, SYM4_MASK_SSE3); + + r_l = _mm_srli_epi16(r_l, 8); + r_h = _mm_srli_epi16(r_h, 8); + + return _mm_packus_epi16(r_l, r_h); +} + +static always_inline __m128i +mul3_sym_sse3(__m128i x, __m128i y) { + + __m128i res = mul4_sym_sse3(x, y); + return _mm_and_si128(res, RGB_MASK_SSE3); +} + +#define LOOP_ALIGNED_U1_A48_SSE3(D, LENGTH, UOP,A4OP, A8OP) \ + { \ + while((uintptr_t)d & 0xF && l) UOP \ + \ + while(l) { \ + switch(l) { \ + case 3: UOP \ + case 2: UOP \ + case 1: UOP \ + break; \ + case 7: \ + case 6: \ + case 5: \ + case 4: \ + A4OP \ + break; \ + default: \ + A8OP \ + break; \ + } \ + } \ + } + + +#endif + #endif -- 2.7.4