From 7d4beedc612a32b73d7673bbf6447de0f3fca298 Mon Sep 17 00:00:00 2001 From: Matt Turner Date: Wed, 9 May 2012 19:20:55 -0400 Subject: [PATCH] mmx: add and use pack_4x565 function The pack_4x565 makes use of the pack_4xpacked565 function which uses pmadd. Some of the speed up is probably attributable to removing the artificial serialization imposed by the vdest = pack_565 (..., vdest, 0); vdest = pack_565 (..., vdest, 1); ... pattern. Loongson: over_n_0565 = L1: 16.44 L2: 16.42 M: 13.83 ( 9.85%) HT: 12.83 VT: 12.61 R: 12.34 RT: 8.90 ( 93Kops/s) over_n_0565 = L1: 42.48 L2: 42.53 M: 29.83 ( 21.20%) HT: 23.39 VT: 23.72 R: 21.80 RT: 11.60 ( 113Kops/s) over_8888_0565 = L1: 15.61 L2: 15.42 M: 12.11 ( 25.79%) HT: 11.07 VT: 10.70 R: 10.37 RT: 7.25 ( 82Kops/s) over_8888_0565 = L1: 35.01 L2: 35.20 M: 21.42 ( 45.57%) HT: 18.12 VT: 17.61 R: 16.09 RT: 9.01 ( 97Kops/s) over_n_8_0565 = L1: 15.17 L2: 14.94 M: 12.57 ( 17.86%) HT: 11.96 VT: 11.52 R: 10.79 RT: 7.31 ( 79Kops/s) over_n_8_0565 = L1: 29.83 L2: 29.79 M: 21.85 ( 30.94%) HT: 18.82 VT: 18.25 R: 16.15 RT: 8.72 ( 91Kops/s) over_n_8888_0565_ca = L1: 15.25 L2: 15.02 M: 11.64 ( 41.39%) HT: 11.08 VT: 10.72 R: 10.02 RT: 7.00 ( 77Kops/s) over_n_8888_0565_ca = L1: 30.12 L2: 29.99 M: 19.47 ( 68.99%) HT: 17.05 VT: 16.55 R: 14.67 RT: 8.38 ( 88Kops/s) ARM/iwMMXt: over_n_0565 = L1: 19.29 L2: 19.88 M: 17.38 ( 10.54%) HT: 15.53 VT: 16.11 R: 13.69 RT: 11.00 ( 96Kops/s) over_n_0565 = L1: 36.02 L2: 34.85 M: 28.04 ( 16.97%) HT: 22.12 VT: 24.21 R: 22.36 RT: 12.22 ( 103Kops/s) over_8888_0565 = L1: 18.38 L2: 16.59 M: 12.34 ( 22.29%) HT: 11.67 VT: 11.71 R: 11.02 RT: 6.89 ( 72Kops/s) over_8888_0565 = L1: 24.96 L2: 22.17 M: 15.11 ( 26.81%) HT: 14.14 VT: 13.71 R: 13.18 RT: 8.13 ( 78Kops/s) over_n_8_0565 = L1: 14.65 L2: 12.44 M: 11.56 ( 14.50%) HT: 10.93 VT: 10.39 R: 10.06 RT: 7.05 ( 70Kops/s) over_n_8_0565 = L1: 18.37 L2: 14.98 M: 13.97 ( 16.51%) HT: 12.67 VT: 10.35 R: 11.80 RT: 8.14 ( 74Kops/s) over_n_8888_0565_ca = L1: 14.27 L2: 12.93 M: 10.52 ( 
33.23%) HT: 9.70 VT: 9.90 R: 9.31 RT: 6.34 ( 65Kops/s) over_n_8888_0565_ca = L1: 19.69 L2: 17.58 M: 13.40 ( 42.35%) HT: 11.75 VT: 11.33 R: 11.17 RT: 7.49 ( 73Kops/s) --- pixman/pixman-mmx.c | 107 +++++++++++++++++++++++++--------------------------- 1 file changed, 52 insertions(+), 55 deletions(-) diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c index b14201a..01a2bc9 100644 --- a/pixman/pixman-mmx.c +++ b/pixman/pixman-mmx.c @@ -598,6 +598,12 @@ pack_4xpacked565 (__m64 a, __m64 b) #endif } +static force_inline __m64 +pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3) +{ + return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3)); +} + #ifndef _MSC_VER static force_inline __m64 @@ -1396,16 +1402,14 @@ mmx_composite_over_n_0565 (pixman_implementation_t *imp, while (w >= 4) { - __m64 vdest; + __m64 vdest = *(__m64 *)dst; - vdest = *(__m64 *)dst; - - vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 0)), vdest, 0); - vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 1)), vdest, 1); - vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 2)), vdest, 2); - vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 3)), vdest, 3); + __m64 v0 = over (vsrc, vsrca, expand565 (vdest, 0)); + __m64 v1 = over (vsrc, vsrca, expand565 (vdest, 1)); + __m64 v2 = over (vsrc, vsrca, expand565 (vdest, 2)); + __m64 v3 = over (vsrc, vsrca, expand565 (vdest, 3)); - *(__m64 *)dst = vdest; + *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); dst += 4; w -= 4; @@ -1818,22 +1822,19 @@ mmx_composite_over_8888_0565 (pixman_implementation_t *imp, while (w >= 4) { - __m64 vsrc0, vsrc1, vsrc2, vsrc3; - __m64 vdest; + __m64 vdest = *(__m64 *)dst; - vsrc0 = load8888 ((src + 0)); - vsrc1 = load8888 ((src + 1)); - vsrc2 = load8888 ((src + 2)); - vsrc3 = load8888 ((src + 3)); + __m64 vsrc0 = load8888 ((src + 0)); + __m64 vsrc1 = load8888 ((src + 1)); + __m64 vsrc2 = load8888 ((src + 2)); + __m64 vsrc3 = load8888 ((src + 3)); - vdest = *(__m64 *)dst; - - vdest = pack_565 (over (vsrc0, 
expand_alpha (vsrc0), expand565 (vdest, 0)), vdest, 0); - vdest = pack_565 (over (vsrc1, expand_alpha (vsrc1), expand565 (vdest, 1)), vdest, 1); - vdest = pack_565 (over (vsrc2, expand_alpha (vsrc2), expand565 (vdest, 2)), vdest, 2); - vdest = pack_565 (over (vsrc3, expand_alpha (vsrc3), expand565 (vdest, 3)), vdest, 3); + __m64 v0 = over (vsrc0, expand_alpha (vsrc0), expand565 (vdest, 0)); + __m64 v1 = over (vsrc1, expand_alpha (vsrc1), expand565 (vdest, 1)); + __m64 v2 = over (vsrc2, expand_alpha (vsrc2), expand565 (vdest, 2)); + __m64 v3 = over (vsrc3, expand_alpha (vsrc3), expand565 (vdest, 3)); - *(__m64 *)dst = vdest; + *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); w -= 4; dst += 4; @@ -2368,25 +2369,22 @@ mmx_composite_over_n_8_0565 (pixman_implementation_t *imp, } else if (m0 | m1 | m2 | m3) { - __m64 vdest; - __m64 vm0, vm1, vm2, vm3; - - vdest = *(__m64 *)dst; + __m64 vdest = *(__m64 *)dst; - vm0 = to_m64 (m0); - vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm0), - expand565 (vdest, 0)), vdest, 0); - vm1 = to_m64 (m1); - vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm1), - expand565 (vdest, 1)), vdest, 1); - vm2 = to_m64 (m2); - vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm2), - expand565 (vdest, 2)), vdest, 2); - vm3 = to_m64 (m3); - vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm3), - expand565 (vdest, 3)), vdest, 3); - - *(__m64 *)dst = vdest; + __m64 vm0 = to_m64 (m0); + __m64 v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), + expand565 (vdest, 0)); + __m64 vm1 = to_m64 (m1); + __m64 v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), + expand565 (vdest, 1)); + __m64 vm2 = to_m64 (m2); + __m64 v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), + expand565 (vdest, 2)); + __m64 vm3 = to_m64 (m3); + __m64 v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), + expand565 (vdest, 3)); + + *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); } w -= 4; @@ -2483,24 +2481,23 @@ mmx_composite_over_pixbuf_0565
(pixman_implementation_t *imp, if ((a0 & a1 & a2 & a3) == 0xFF) { - __m64 vdest; - vdest = pack_565 (invert_colors (load8888 (&s0)), _mm_setzero_si64 (), 0); - vdest = pack_565 (invert_colors (load8888 (&s1)), vdest, 1); - vdest = pack_565 (invert_colors (load8888 (&s2)), vdest, 2); - vdest = pack_565 (invert_colors (load8888 (&s3)), vdest, 3); + __m64 v0 = invert_colors (load8888 (&s0)); + __m64 v1 = invert_colors (load8888 (&s1)); + __m64 v2 = invert_colors (load8888 (&s2)); + __m64 v3 = invert_colors (load8888 (&s3)); - *(__m64 *)dst = vdest; + *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); } else if (s0 | s1 | s2 | s3) { __m64 vdest = *(__m64 *)dst; - vdest = pack_565 (over_rev_non_pre (load8888 (&s0), expand565 (vdest, 0)), vdest, 0); - vdest = pack_565 (over_rev_non_pre (load8888 (&s1), expand565 (vdest, 1)), vdest, 1); - vdest = pack_565 (over_rev_non_pre (load8888 (&s2), expand565 (vdest, 2)), vdest, 2); - vdest = pack_565 (over_rev_non_pre (load8888 (&s3), expand565 (vdest, 3)), vdest, 3); + __m64 v0 = over_rev_non_pre (load8888 (&s0), expand565 (vdest, 0)); + __m64 v1 = over_rev_non_pre (load8888 (&s1), expand565 (vdest, 1)); + __m64 v2 = over_rev_non_pre (load8888 (&s2), expand565 (vdest, 2)); + __m64 v3 = over_rev_non_pre (load8888 (&s3), expand565 (vdest, 3)); - *(__m64 *)dst = vdest; + *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); } w -= 4; @@ -2675,12 +2672,12 @@ mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, { __m64 vdest = *(__m64 *)q; - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m0), expand565 (vdest, 0)), vdest, 0); - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m1), expand565 (vdest, 1)), vdest, 1); - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m2), expand565 (vdest, 2)), vdest, 2); - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m3), expand565 (vdest, 3)), vdest, 3); + __m64 v0 = in_over (vsrc, vsrca, load8888 (&m0), expand565 (vdest, 0)); + __m64 v1 = in_over (vsrc, vsrca, load8888 (&m1), expand565 
(vdest, 1)); + __m64 v2 = in_over (vsrc, vsrca, load8888 (&m2), expand565 (vdest, 2)); + __m64 v3 = in_over (vsrc, vsrca, load8888 (&m3), expand565 (vdest, 3)); - *(__m64 *)q = vdest; + *(__m64 *)q = pack_4x565 (v0, v1, v2, v3); } twidth -= 4; p += 4; -- 2.7.4