/*
 * evas: add SSE3 op_blend_color blend functions
 *    _op_blend_c_dp_sse3
 *    _op_blend_rel_c_dp_sse3
 *
 * Provenance (kept from the commit header this file was recovered from):
 *    From:       lucas
 *    Date:       Fri, 30 Sep 2011 02:36:01 +0000 (+0000)
 *    Patch by:   Jim Kukunas
 *    File:       src/lib/engines/common/evas_op_blend/op_blend_color_sse3.c
 *                (new file, index 0000000..215c170)
 *    Commit:     6ec5a37a2e3676cd1fe6b4c72ed907800d2e8e78
 *    X-Git-Tag:  accepted/2.0/20130306.225542~242^2~29
 *    X-Git-Url:  http://review.tizen.org/git/?a=commitdiff_plain;h=6ec5a37a2e3676cd1fe6b4c72ed907800d2e8e78;p=profile%2Fivi%2Fevas.git
 *    git-svn-id: svn+ssh://svn.enlightenment.org/var/svn/e/trunk/evas@63699 7cbeb6ba-43b4-40fd-8cce-4c39aea84d33
 */

/* blend color -> dst */

#ifdef BUILD_SSE3

/*
 * Blend the constant color `c` onto a span of `l` destination pixels.
 *
 * Every destination pixel becomes  c + MUL_256(a, *d)  where
 * a = 256 - (c >> 24), i.e. 256 minus the top byte of `c`.
 *
 * s, m: unused; present so the function matches the span-function signature
 *       stored in op_blend_span_funcs.
 * c:    the color to blend.
 * d:    destination span, updated in place.
 * l:    number of pixels in the span.
 *
 * NOTE(review): LOOP_ALIGNED_U1_A48_SSE3 is defined outside this file. It is
 * presumably responsible for choosing between the one-pixel (UOP), four-pixel
 * (A4OP) and eight-pixel (A8OP) bodies and for only entering the vector bodies
 * once `d` is 16-byte aligned -- confirm against the macro definition. The
 * bodies advance `d` and decrement `l` themselves.
 */
static void
_op_blend_c_dp_sse3(DATA32 *s __UNUSED__, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l)
{
   const DATA32 a = 256 - (c >> 24);

   /* Broadcast the scalars to all four 32-bit lanes. _mm_set1_epi32() takes
    * an int, so the unsigned -> int conversion is spelled out. */
   const __m128i c_packed = _mm_set1_epi32((int)c);
   const __m128i a_packed = _mm_set1_epi32((int)a);

   LOOP_ALIGNED_U1_A48_SSE3(d, l,
      { /* UOP: one pixel, scalar */

         *d = c + MUL_256(a, *d);
         d++; l--;
      },
      { /* A4OP: four pixels; _mm_load_si128 needs a 16-byte aligned d */

         __m128i d0 = _mm_load_si128((__m128i *)d);

         d0 = mul_256_sse3(a_packed, d0);
         d0 = _mm_add_epi32(d0, c_packed);

         _mm_store_si128((__m128i *)d, d0);

         d += 4; l -= 4;
      },
      { /* A8OP: eight pixels as two independent four-pixel vectors */

         __m128i d0 = _mm_load_si128((__m128i *)d);
         __m128i d1 = _mm_load_si128((__m128i *)(d + 4));

         d0 = mul_256_sse3(a_packed, d0);
         d1 = mul_256_sse3(a_packed, d1);

         d0 = _mm_add_epi32(d0, c_packed);
         d1 = _mm_add_epi32(d1, c_packed);

         _mm_store_si128((__m128i *)d, d0);
         _mm_store_si128((__m128i *)(d + 4), d1);

         d += 8; l -= 8;
      })
}

/* The remaining color/destination variants all reuse the function above. */
#define _op_blend_caa_dp_sse3 _op_blend_c_dp_sse3

#define _op_blend_c_dpan_sse3 _op_blend_c_dp_sse3
#define _op_blend_caa_dpan_sse3 _op_blend_c_dpan_sse3

/*
 * Register the SSE3 "blend color" span functions in op_blend_span_funcs for
 * the SC / SC_AA color kinds and the DP / DP_AN destination kinds.
 */
static void
init_blend_color_span_funcs_sse3(void)
{
   op_blend_span_funcs[SP_N][SM_N][SC][DP][CPU_SSE3] = _op_blend_c_dp_sse3;
   op_blend_span_funcs[SP_N][SM_N][SC_AA][DP][CPU_SSE3] = _op_blend_caa_dp_sse3;

   op_blend_span_funcs[SP_N][SM_N][SC][DP_AN][CPU_SSE3] = _op_blend_c_dpan_sse3;
   op_blend_span_funcs[SP_N][SM_N][SC_AA][DP_AN][CPU_SSE3] = _op_blend_caa_dpan_sse3;
}

/*
 * No SSE3 point (single pixel) implementation is provided: every point alias
 * below resolves to NULL.
 * NOTE(review): how a NULL table entry is treated is decided by the code that
 * reads op_blend_pt_funcs, which is not part of this file -- confirm there.
 */
#define _op_blend_pt_c_dp_sse3 NULL
#define _op_blend_pt_caa_dp_sse3 _op_blend_pt_c_dp_sse3

#define _op_blend_pt_c_dpan_sse3 _op_blend_pt_c_dp_sse3
#define _op_blend_pt_caa_dpan_sse3 _op_blend_pt_c_dpan_sse3

#define _op_blend_pt_c_dpas_sse3 _op_blend_pt_c_dp_sse3
#define _op_blend_pt_caa_dpas_sse3 _op_blend_pt_c_dp_sse3

/*
 * Fill the SSE3 "blend color" slots of op_blend_pt_funcs (all NULL, see the
 * aliases above).
 */
static void
init_blend_color_pt_funcs_sse3(void)
{
   op_blend_pt_funcs[SP_N][SM_N][SC][DP][CPU_SSE3] = _op_blend_pt_c_dp_sse3;
   op_blend_pt_funcs[SP_N][SM_N][SC_AA][DP][CPU_SSE3] = _op_blend_pt_caa_dp_sse3;

   op_blend_pt_funcs[SP_N][SM_N][SC][DP_AN][CPU_SSE3] = _op_blend_pt_c_dpan_sse3;
   op_blend_pt_funcs[SP_N][SM_N][SC_AA][DP_AN][CPU_SSE3] = _op_blend_pt_caa_dpan_sse3;
}


/*-----*/

/* blend_rel color -> dst */

/*
 * "Relative" blend of the constant color `c` onto a span of `l` destination
 * pixels.
 *
 * Every destination pixel becomes
 *    MUL_SYM(*d >> 24, c) + MUL_256(alpha, *d)
 * where alpha = 256 - (c >> 24). Compared with _op_blend_c_dp_sse3 the color
 * term is additionally scaled by the top byte of the destination pixel.
 *
 * s, m: unused; present so the function matches the span-function signature
 *       stored in op_blend_rel_span_funcs.
 * c:    the color to blend.
 * d:    destination span, updated in place.
 * l:    number of pixels in the span.
 *
 * NOTE(review): as in _op_blend_c_dp_sse3, the choice between the UOP / A4OP /
 * A8OP bodies and the alignment guarantee for the vector bodies are assumed to
 * come from LOOP_ALIGNED_U1_A48_SSE3, which is defined elsewhere -- confirm.
 */
static void
_op_blend_rel_c_dp_sse3(DATA32 *s __UNUSED__, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l)
{
   int alpha = 256 - (c >> 24);

   /* Broadcast the scalars to all four 32-bit lanes. */
   const __m128i c_packed = _mm_set1_epi32((int)c);
   const __m128i alpha_packed = _mm_set1_epi32(alpha);

   LOOP_ALIGNED_U1_A48_SSE3(d, l,
      { /* UOP: one pixel, scalar */

         *d = MUL_SYM(*d >> 24, c) + MUL_256(alpha, *d);
         d++; l--;
      },
      { /* A4OP: four pixels; _mm_load_si128 needs a 16-byte aligned d */

         __m128i d0 = _mm_load_si128((__m128i *)d);

         __m128i mul0 = mul_256_sse3(alpha_packed, d0);
         /* the 24-bit right shift leaves each pixel's top byte in its lane */
         __m128i sym0 = mul_sym_sse3(_mm_srli_epi32(d0, 24), c_packed);

         d0 = _mm_add_epi32(mul0, sym0);

         _mm_store_si128((__m128i *)d, d0);

         d += 4; l -= 4;
      },
      { /* A8OP: eight pixels as two independent four-pixel vectors */

         __m128i d0 = _mm_load_si128((__m128i *)d);
         __m128i d1 = _mm_load_si128((__m128i *)(d + 4));

         __m128i mul0 = mul_256_sse3(alpha_packed, d0);
         __m128i mul1 = mul_256_sse3(alpha_packed, d1);

         __m128i sym0 = mul_sym_sse3(_mm_srli_epi32(d0, 24), c_packed);
         __m128i sym1 = mul_sym_sse3(_mm_srli_epi32(d1, 24), c_packed);

         d0 = _mm_add_epi32(mul0, sym0);
         d1 = _mm_add_epi32(mul1, sym1);

         _mm_store_si128((__m128i *)d, d0);
         _mm_store_si128((__m128i *)(d + 4), d1);

         d += 8; l -= 8;
      })
}

/*
 * The DP_AN variants reuse the plain (non-rel) blend functions.
 * NOTE(review): presumably valid because a DP_AN destination has no alpha
 * channel to scale by -- confirm against the generic C implementation.
 */
#define _op_blend_rel_caa_dp_sse3 _op_blend_rel_c_dp_sse3
#define _op_blend_rel_c_dpan_sse3 _op_blend_c_dpan_sse3
#define _op_blend_rel_caa_dpan_sse3 _op_blend_caa_dpan_sse3

/*
 * Register the SSE3 "blend_rel color" span functions in
 * op_blend_rel_span_funcs for the SC / SC_AA color kinds and the DP / DP_AN
 * destination kinds.
 */
static void
init_blend_rel_color_span_funcs_sse3(void)
{
   op_blend_rel_span_funcs[SP_N][SM_N][SC][DP][CPU_SSE3] = _op_blend_rel_c_dp_sse3;
   op_blend_rel_span_funcs[SP_N][SM_N][SC_AA][DP][CPU_SSE3] = _op_blend_rel_caa_dp_sse3;

   op_blend_rel_span_funcs[SP_N][SM_N][SC][DP_AN][CPU_SSE3] = _op_blend_rel_c_dpan_sse3;
   op_blend_rel_span_funcs[SP_N][SM_N][SC_AA][DP_AN][CPU_SSE3] = _op_blend_rel_caa_dpan_sse3;
}

/* No SSE3 point implementation for blend_rel either: all aliases are NULL. */
#define _op_blend_rel_pt_c_dp_sse3 NULL
#define _op_blend_rel_pt_caa_dp_sse3 _op_blend_rel_pt_c_dp_sse3

#define _op_blend_rel_pt_c_dpan_sse3 _op_blend_pt_c_dpan_sse3
#define _op_blend_rel_pt_caa_dpan_sse3 _op_blend_pt_caa_dpan_sse3

/*
 * Fill the SSE3 "blend_rel color" slots of op_blend_rel_pt_funcs (all NULL,
 * see the aliases above).
 */
static void
init_blend_rel_color_pt_funcs_sse3(void)
{
   op_blend_rel_pt_funcs[SP_N][SM_N][SC][DP][CPU_SSE3] = _op_blend_rel_pt_c_dp_sse3;
   op_blend_rel_pt_funcs[SP_N][SM_N][SC_AA][DP][CPU_SSE3] = _op_blend_rel_pt_caa_dp_sse3;

   op_blend_rel_pt_funcs[SP_N][SM_N][SC][DP_AN][CPU_SSE3] = _op_blend_rel_pt_c_dpan_sse3;
   op_blend_rel_pt_funcs[SP_N][SM_N][SC_AA][DP_AN][CPU_SSE3] = _op_blend_rel_pt_caa_dpan_sse3;
}

#endif