From 08e855f15cba24aac83145b994069d0bb50be5a1 Mon Sep 17 00:00:00 2001
From: =?utf8?q?S=C3=B8ren=20Sandmann=20Pedersen?=
Date: Sun, 23 Jan 2011 16:17:17 -0500
Subject: [PATCH] Add SSE2 fetcher for a8

New output of lowlevel-blt-bench over_x888_8_0565:

    over_x888_8_0565 = L1: 57.85 L2: 56.80 M: 54.14 ( 19.50%) HT: 42.64 VT: 40.56 R: 32.67 RT: 16.22 ( 195Kops/s)

Based in part on code by Steve Snyder from

    https://bugs.freedesktop.org/show_bug.cgi?id=21173
---
 pixman/pixman-sse2.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 66 insertions(+), 1 deletion(-)

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 10a3dd0..d5ea14b 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5989,6 +5989,70 @@ sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
     return iter->buffer;
 }
 
+/*
+ * Scanline fetcher for PIXMAN_a8 sources.
+ *
+ * Expands iter->width 8-bit a8 values starting at iter->bits into
+ * 32-bit pixels in iter->buffer, with the source byte in bits 24-31
+ * and all lower bits zero.  iter->bits is advanced by iter->stride.
+ * The mask argument is not used.  Returns iter->buffer.
+ */
+static uint32_t *
+sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int w = iter->width;
+    uint32_t *dst = iter->buffer;
+    const uint8_t *src = iter->bits;
+    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
+
+    iter->bits += iter->stride;
+
+    /* Scalar head: advance until dst is 16-byte aligned, as required
+     * by the _mm_store_si128 calls below.  The cast to uint32_t keeps
+     * the shift unsigned; a promoted int shifted left by 24 would
+     * overflow for source values >= 0x80.
+     */
+    while (w && (((uintptr_t)dst) & 15))
+    {
+        *dst++ = (uint32_t) *(src++) << 24;
+        w--;
+    }
+
+    /* Vector body: 16 pixels per iteration.  Interleaving zero bytes
+     * below each source byte, and then zero words below each result,
+     * leaves every source byte in the top byte of a 32-bit lane.
+     */
+    while (w >= 16)
+    {
+        xmm0 = _mm_loadu_si128 ((const __m128i *)src);
+
+        xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128 (), xmm0);
+        xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128 (), xmm0);
+        xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128 (), xmm1);
+        xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128 (), xmm1);
+        xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128 (), xmm2);
+        xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128 (), xmm2);
+
+        _mm_store_si128 (((__m128i *)(dst +  0)), xmm3);
+        _mm_store_si128 (((__m128i *)(dst +  4)), xmm4);
+        _mm_store_si128 (((__m128i *)(dst +  8)), xmm5);
+        _mm_store_si128 (((__m128i *)(dst + 12)), xmm6);
+
+        dst += 16;
+        src += 16;
+        w -= 16;
+    }
+
+    /* Scalar tail: the remaining 0-15 pixels. */
+    while (w)
+    {
+        *dst++ = (uint32_t) *(src++) << 24;
+        w--;
+    }
+
+    return iter->buffer;
+}
+
 typedef struct
 {
     pixman_format_code_t format;
@@ -5997,7 +6061,8 @@ typedef struct
 
 static const fetcher_info_t fetchers[] =
 {
-    { PIXMAN_x8r8g8b8, sse2_fetch_x8r8g8b8 },
+    { PIXMAN_x8r8g8b8,       sse2_fetch_x8r8g8b8 },
+    { PIXMAN_a8,             sse2_fetch_a8 },
     { PIXMAN_null }
 };
 
-- 
2.7.4