From 51681a052f9e1d0970a79187974da77d9bf69450 Mon Sep 17 00:00:00 2001 From: Matt Turner Date: Sun, 13 May 2012 20:39:05 -0400 Subject: [PATCH] mmx: add and use expand_4xpacked565 function Loongson: add_0565_0565 = L1: 14.39 L2: 13.98 M: 11.28 ( 15.22%) HT: 10.11 VT: 9.74 R: 9.39 RT: 6.05 ( 67Kops/s) add_0565_0565 = L1: 15.37 L2: 14.91 M: 11.83 ( 16.06%) HT: 10.53 VT: 10.15 R: 9.74 RT: 6.19 ( 68Kops/s) ARM/iwMMXt: add_0565_0565 = L1: 11.12 L2: 10.40 M: 8.82 ( 10.65%) HT: 7.98 VT: 7.41 R: 7.57 RT: 5.21 ( 54Kops/s) add_0565_0565 = L1: 12.87 L2: 11.58 M: 10.11 ( 12.50%) HT: 9.06 VT: 8.66 R: 7.70 RT: 5.62 ( 58Kops/s) --- pixman/loongson-mmintrin.h | 21 +++++++++++++++++++++ pixman/pixman-mmx.c | 44 ++++++++++++++++++++++++++++++++++++++------ 2 files changed, 59 insertions(+), 6 deletions(-) diff --git a/pixman/loongson-mmintrin.h b/pixman/loongson-mmintrin.h index 8295ba0..1a114fe 100644 --- a/pixman/loongson-mmintrin.h +++ b/pixman/loongson-mmintrin.h @@ -77,6 +77,17 @@ _mm_and_si64 (__m64 __m1, __m64 __m2) return ret; } +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2) +{ + __m64 ret; + asm("pcmpeqw %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + return ret; +} + extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_empty (void) { @@ -150,6 +161,16 @@ _mm_shuffle_pi16 (__m64 __m, int64_t __n) } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_pi16 (__m64 __m, int64_t __count) +{ + __m64 ret; + asm("psllh %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m), "f" (*(__m64 *)&__count) + ); + return ret; +} +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_slli_si64 (__m64 __m, int64_t __count) { __m64 ret; diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c index eb02d1a..af34755 100644 --- a/pixman/pixman-mmx.c +++ b/pixman/pixman-mmx.c @@ -185,6 +185,9 @@ typedef struct mmxdatafield mmx_565_b; mmxdatafield mmx_packed_565_rb; mmxdatafield mmx_packed_565_g; + mmxdatafield mmx_expand_565_g; + mmxdatafield mmx_expand_565_b; + mmxdatafield mmx_expand_565_r; #ifndef USE_LOONGSON_MMI mmxdatafield mmx_mask_0; mmxdatafield mmx_mask_1; @@ -216,6 +219,9 @@ static const mmx_data_t c = MMXDATA_INIT (.mmx_565_b, 0x00000000000000f8), MMXDATA_INIT (.mmx_packed_565_rb, 0x00f800f800f800f8), MMXDATA_INIT (.mmx_packed_565_g, 0x0000fc000000fc00), + MMXDATA_INIT (.mmx_expand_565_g, 0x07e007e007e007e0), + MMXDATA_INIT (.mmx_expand_565_b, 0x001f001f001f001f), + MMXDATA_INIT (.mmx_expand_565_r, 0xf800f800f800f800), #ifndef USE_LOONGSON_MMI MMXDATA_INIT (.mmx_mask_0, 0xffffffffffff0000), MMXDATA_INIT (.mmx_mask_1, 0xffffffff0000ffff), @@ -518,6 +524,34 @@ expand565 (__m64 pixel, int pos) return _mm_srli_pi16 (pixel, 8); } +/* Expand 4 16 bit pixels in an mmx register into two mmx registers of + * + * AARRGGBBRRGGBB + */ +static force_inline void +expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1) +{ + __m64 t0, t1, alpha = _mm_cmpeq_pi32 (_mm_setzero_si64 (), _mm_setzero_si64 ()); + __m64 r = _mm_and_si64 (vin, MC (expand_565_r)); + __m64 g = _mm_and_si64 (vin, MC (expand_565_g)); + __m64 b = _mm_and_si64 (vin, MC (expand_565_b)); + + /* Replicate high bits into empty low bits. */ + r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13)); + g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9)); + b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2)); + + r = _mm_packs_pu16 (r, _mm_setzero_si64 ()); /* 00 00 00 00 R3 R2 R1 R0 */ + g = _mm_packs_pu16 (g, _mm_setzero_si64 ()); /* 00 00 00 00 G3 G2 G1 G0 */ + b = _mm_packs_pu16 (b, _mm_setzero_si64 ()); /* 00 00 00 00 B3 B2 B1 B0 */ + + t1 = _mm_unpacklo_pi8 (r, alpha); /* A3 R3 A2 R2 A1 R1 A0 R0 */ + t0 = _mm_unpacklo_pi8 (b, g); /* G3 B3 G2 B2 G1 B1 G0 B0 */ + + *vout0 = _mm_unpacklo_pi16 (t0, t1); /* A1 R1 G1 B1 A0 R0 G0 B0 */ + *vout1 = _mm_unpackhi_pi16 (t0, t1); /* A3 R3 G3 B3 A2 R2 G2 B2 */ +} + static force_inline __m64 expand8888 (__m64 in, int pos) { @@ -3346,14 +3380,12 @@ mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask) while (w >= 4) { __m64 vsrc = ldq_u ((__m64 *)src); + __m64 mm0, mm1; - __m64 mm0 = expand565 (vsrc, 0); - __m64 mm1 = expand565 (vsrc, 1); - __m64 mm2 = expand565 (vsrc, 2); - __m64 mm3 = expand565 (vsrc, 3); + expand_4xpacked565 (vsrc, &mm0, &mm1); - *(__m64 *)(dst + 0) = _mm_or_si64 (pack8888 (mm0, mm1), MC (ff000000)); - *(__m64 *)(dst + 2) = _mm_or_si64 (pack8888 (mm2, mm3), MC (ff000000)); + *(__m64 *)(dst + 0) = mm0; + *(__m64 *)(dst + 2) = mm1; dst += 4; src += 4; -- 2.7.4