From f8dc0e98343c7936a37a3624721c5782e7ac309c Mon Sep 17 00:00:00 2001
From: Matt Turner
Date: Thu, 17 May 2012 13:22:18 -0400
Subject: [PATCH] mmx: implement expand_4x565 in terms of expand_4xpacked565

lowlevel-blt-bench results; for each operation the first line is
before and the second line is after the patch.

Loongson:
over_n_0565 =  L1:  38.57  L2:  38.88  M: 30.01 ( 20.97%)  HT: 23.60  VT: 23.88  R: 21.95  RT: 11.65 ( 113Kops/s)
over_n_0565 =  L1:  56.28  L2:  55.90  M: 34.20 ( 23.82%)  HT: 25.66  VT: 26.60  R: 23.78  RT: 11.80 ( 115Kops/s)

over_8888_0565 =  L1:  35.89  L2:  36.11  M: 21.56 ( 45.47%)  HT: 18.33  VT: 17.90  R: 16.27  RT:  9.07 ( 98Kops/s)
over_8888_0565 =  L1:  40.91  L2:  41.06  M: 23.13 ( 48.46%)  HT: 19.24  VT: 18.71  R: 16.82  RT:  9.18 ( 99Kops/s)

over_n_8_0565 =  L1:  28.92  L2:  29.12  M: 21.42 ( 30.00%)  HT: 18.37  VT: 17.75  R: 16.15  RT:  8.79 ( 91Kops/s)
over_n_8_0565 =  L1:  32.32  L2:  32.13  M: 22.44 ( 31.27%)  HT: 19.15  VT: 18.66  R: 16.62  RT:  8.86 ( 92Kops/s)

over_n_8888_0565_ca =  L1:  29.33  L2:  29.22  M: 18.99 ( 66.69%)  HT: 16.69  VT: 16.22  R: 14.63  RT:  8.42 ( 88Kops/s)
over_n_8888_0565_ca =  L1:  34.97  L2:  34.14  M: 20.32 ( 71.73%)  HT: 17.67  VT: 17.19  R: 15.23  RT:  8.50 ( 89Kops/s)

ARM/iwMMXt:
over_n_0565 =  L1:  29.70  L2:  30.53  M: 24.47 ( 14.84%)  HT: 22.28  VT: 21.72  R: 21.13  RT: 12.58 ( 105Kops/s)
over_n_0565 =  L1:  41.42  L2:  40.00  M: 30.95 ( 19.13%)  HT: 27.06  VT: 27.28  R: 23.43  RT: 14.44 ( 114Kops/s)

over_8888_0565 =  L1:  12.73  L2:  11.53  M:  9.07 ( 16.47%)  HT:  9.00  VT:  9.25  R:  8.44  RT:  7.27 ( 76Kops/s)
over_8888_0565 =  L1:  23.72  L2:  21.76  M: 15.89 ( 29.51%)  HT: 14.36  VT: 14.05  R: 12.44  RT:  8.94 ( 86Kops/s)

over_n_8_0565 =  L1:   6.80  L2:   7.15  M:  6.37 (  7.90%)  HT:  6.58  VT:  6.24  R:  6.49  RT:  5.94 ( 59Kops/s)
over_n_8_0565 =  L1:  12.06  L2:  11.02  M: 10.16 ( 13.43%)  HT:  9.57  VT:  8.49  R:  9.10  RT:  6.86 ( 69Kops/s)

over_n_8888_0565_ca =  L1:   7.62  L2:   7.01  M:  6.27 ( 20.52%)  HT:  6.00  VT:  6.07  R:  5.68  RT:  5.53 ( 57Kops/s)
over_n_8888_0565_ca =  L1:  13.54  L2:  11.96  M:  9.76 ( 30.66%)  HT:  9.72  VT:  8.45  R:  9.37  RT:  6.85 ( 67Kops/s)
---
 pixman/pixman-mmx.c | 86 ++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 59 insertions(+), 27 deletions(-)

diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index af34755..70dd4e0 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -529,12 +529,14 @@ expand565 (__m64 pixel, int pos)
  * AARRGGBBRRGGBB
  */
 static force_inline void
-expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1)
+expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
 {
-    __m64 t0, t1, alpha = _mm_cmpeq_pi32 (_mm_setzero_si64 (), _mm_setzero_si64 ());
+    __m64 t0, t1, alpha = _mm_setzero_si64 ();
     __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
     __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
     __m64 b = _mm_and_si64 (vin, MC (expand_565_b));
+    if (full_alpha)
+        alpha = _mm_cmpeq_pi32 (alpha, alpha);
 
     /* Replicate high bits into empty low bits. */
     r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
@@ -567,6 +569,17 @@ expandx888 (__m64 in, int pos)
     return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
 }
 
+static force_inline void
+expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha)
+{
+    __m64 v0, v1;
+    expand_4xpacked565 (vin, &v0, &v1, full_alpha);
+    *vout0 = expand8888 (v0, 0);
+    *vout1 = expand8888 (v0, 1);
+    *vout2 = expand8888 (v1, 0);
+    *vout3 = expand8888 (v1, 1);
+}
+
 static force_inline __m64
 pack_565 (__m64 pixel, __m64 target, int pos)
 {
@@ -1442,11 +1455,14 @@ mmx_composite_over_n_0565 (pixman_implementation_t *imp,
         while (w >= 4)
         {
             __m64 vdest = *(__m64 *)dst;
+            __m64 v0, v1, v2, v3;
 
-            __m64 v0 = over (vsrc, vsrca, expand565 (vdest, 0));
-            __m64 v1 = over (vsrc, vsrca, expand565 (vdest, 1));
-            __m64 v2 = over (vsrc, vsrca, expand565 (vdest, 2));
-            __m64 v3 = over (vsrc, vsrca, expand565 (vdest, 3));
+            expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
+
+            v0 = over (vsrc, vsrca, v0);
+            v1 = over (vsrc, vsrca, v1);
+            v2 = over (vsrc, vsrca, v2);
+            v3 = over (vsrc, vsrca, v3);
 
             *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
 
@@ -1862,16 +1878,19 @@ mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
         while (w >= 4)
         {
             __m64 vdest = *(__m64 *)dst;
+            __m64 v0, v1, v2, v3;
+
+            expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
 
             __m64 vsrc0 = load8888 ((src + 0));
             __m64 vsrc1 = load8888 ((src + 1));
             __m64 vsrc2 = load8888 ((src + 2));
             __m64 vsrc3 = load8888 ((src + 3));
 
-            __m64 v0 = over (vsrc0, expand_alpha (vsrc0), expand565 (vdest, 0));
-            __m64 v1 = over (vsrc1, expand_alpha (vsrc1), expand565 (vdest, 1));
-            __m64 v2 = over (vsrc2, expand_alpha (vsrc2), expand565 (vdest, 2));
-            __m64 v3 = over (vsrc3, expand_alpha (vsrc3), expand565 (vdest, 3));
+            v0 = over (vsrc0, expand_alpha (vsrc0), v0);
+            v1 = over (vsrc1, expand_alpha (vsrc1), v1);
+            v2 = over (vsrc2, expand_alpha (vsrc2), v2);
+            v3 = over (vsrc3, expand_alpha (vsrc3), v3);
 
             *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
 
@@ -2409,19 +2428,21 @@ mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
             else if (m0 | m1 | m2 | m3)
             {
                 __m64 vdest = *(__m64 *)dst;
+                __m64 v0, v1, v2, v3;
+
+                expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
 
                 __m64 vm0 = to_m64 (m0);
-                __m64 v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0),
-                                    expand565 (vdest, 0));
+                v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0);
+
                 __m64 vm1 = to_m64 (m1);
-                __m64 v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1),
-                                    expand565 (vdest, 1));
+                v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1);
+
                 __m64 vm2 = to_m64 (m2);
-                __m64 v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2),
-                                    expand565 (vdest, 2));
+                v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2);
+
                 __m64 vm3 = to_m64 (m3);
-                __m64 v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3),
-                                    expand565 (vdest, 3));
+                v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3);
 
                 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
             }
@@ -2530,11 +2551,19 @@ mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
             else if (s0 | s1 | s2 | s3)
             {
                 __m64 vdest = *(__m64 *)dst;
+                __m64 v0, v1, v2, v3;
 
-                __m64 v0 = over_rev_non_pre (load8888 (&s0), expand565 (vdest, 0));
-                __m64 v1 = over_rev_non_pre (load8888 (&s1), expand565 (vdest, 1));
-                __m64 v2 = over_rev_non_pre (load8888 (&s2), expand565 (vdest, 2));
-                __m64 v3 = over_rev_non_pre (load8888 (&s3), expand565 (vdest, 3));
+                __m64 vsrc0 = load8888 (&s0);
+                __m64 vsrc1 = load8888 (&s1);
+                __m64 vsrc2 = load8888 (&s2);
+                __m64 vsrc3 = load8888 (&s3);
+
+                expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
+
+                v0 = over_rev_non_pre (vsrc0, v0);
+                v1 = over_rev_non_pre (vsrc1, v1);
+                v2 = over_rev_non_pre (vsrc2, v2);
+                v3 = over_rev_non_pre (vsrc3, v3);
 
                 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
             }
@@ -2710,11 +2739,14 @@ mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
                 if ((m0 | m1 | m2 | m3))
                 {
                     __m64 vdest = *(__m64 *)q;
+                    __m64 v0, v1, v2, v3;
+
+                    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
 
-                    __m64 v0 = in_over (vsrc, vsrca, load8888 (&m0), expand565 (vdest, 0));
-                    __m64 v1 = in_over (vsrc, vsrca, load8888 (&m1), expand565 (vdest, 1));
-                    __m64 v2 = in_over (vsrc, vsrca, load8888 (&m2), expand565 (vdest, 2));
-                    __m64 v3 = in_over (vsrc, vsrca, load8888 (&m3), expand565 (vdest, 3));
+                    v0 = in_over (vsrc, vsrca, load8888 (&m0), v0);
+                    v1 = in_over (vsrc, vsrca, load8888 (&m1), v1);
+                    v2 = in_over (vsrc, vsrca, load8888 (&m2), v2);
+                    v3 = in_over (vsrc, vsrca, load8888 (&m3), v3);
 
                     *(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
                 }
@@ -3382,7 +3414,7 @@ mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
         __m64 vsrc = ldq_u ((__m64 *)src);
         __m64 mm0, mm1;
 
-        expand_4xpacked565 (vsrc, &mm0, &mm1);
+        expand_4xpacked565 (vsrc, &mm0, &mm1, 1);
 
         *(__m64 *)(dst + 0) = mm0;
         *(__m64 *)(dst + 2) = mm1;
-- 
2.7.4
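
Note for reviewers (not part of the patch): both expand565 and
expand_4xpacked565 rely on the standard trick for widening a 5- or
6-bit channel to 8 bits: shift it up and replicate its high bits into
the vacated low bits, so 0x1f maps to 0xff and 0x00 to 0x00 with no
multiply or divide. The scalar sketch below is illustrative only;
expand565_scalar is a hypothetical name, not a pixman function.

#include <stdint.h>
#include <stdio.h>

/* Scalar model of the SIMD 565 expansion: widen each channel by
 * replicating its high bits into the vacated low bits, so the
 * endpoints map exactly (0x1f -> 0xff, 0x00 -> 0x00). */
static uint32_t
expand565_scalar (uint16_t p, int full_alpha)
{
    uint32_t r5 = (p >> 11) & 0x1f;
    uint32_t g6 = (p >> 5) & 0x3f;
    uint32_t b5 = p & 0x1f;

    uint32_t r8 = (r5 << 3) | (r5 >> 2); /* replicate top 3 bits */
    uint32_t g8 = (g6 << 2) | (g6 >> 4); /* replicate top 2 bits */
    uint32_t b8 = (b5 << 3) | (b5 >> 2);

    /* Models the new full_alpha parameter: 1 when fetching r5g6b5 as
     * a8r8g8b8, 0 when the expanded alpha will be discarded anyway. */
    uint32_t a8 = full_alpha ? 0xffu : 0x00u;

    return (a8 << 24) | (r8 << 16) | (g8 << 8) | b8;
}

int
main (void)
{
    printf ("%08x\n", expand565_scalar (0xffff, 1)); /* ffffffff */
    printf ("%08x\n", expand565_scalar (0x07e0, 0)); /* 0000ff00 */
    return 0;
}

The flag matches the two kinds of call sites in the diff: the
compositing paths pass 0 because pack_4x565 throws the expanded alpha
away when packing back to 565, while mmx_fetch_r5g6b5 passes 1 because
the a8r8g8b8 pixels it produces must come out opaque.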
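Also for reference: the over () helper used in these fast paths
implements the Porter-Duff OVER operator on premultiplied pixels
(in_over additionally multiplies the source by the mask first). A
minimal scalar sketch, assuming 8-bit premultiplied ARGB; over_scalar
and div255 are hypothetical names, with div255 being the usual exact
rounding division by 255:

#include <stdint.h>
#include <stdio.h>

/* Exact rounding division by 255 for x in [0, 255*255]. */
static uint8_t
div255 (uint32_t x)
{
    x += 0x80;
    return (uint8_t) ((x + (x >> 8)) >> 8);
}

/* Scalar OVER on premultiplied a8r8g8b8: per channel,
 * dest' = src + dest * (255 - alpha (src)) / 255. */
static uint32_t
over_scalar (uint32_t src, uint32_t dest)
{
    uint32_t ia = 255 - (src >> 24); /* inverse source alpha */
    uint32_t out = 0;
    int shift;

    for (shift = 0; shift < 32; shift += 8)
    {
        uint32_t s = (src >> shift) & 0xff;
        uint32_t d = (dest >> shift) & 0xff;
        out |= (uint32_t) (s + div255 (d * ia)) << shift;
    }
    return out;
}

int
main (void)
{
    /* Opaque red covers the dest; a fully transparent source leaves it. */
    printf ("%08x\n", over_scalar (0xffff0000, 0xff00ff00)); /* ffff0000 */
    printf ("%08x\n", over_scalar (0x00000000, 0xff00ff00)); /* ff00ff00 */
    return 0;
}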