From a47ed2c31180e6c3b332747a1721731e0649b10f Mon Sep 17 00:00:00 2001 From: Siarhei Siamashka Date: Mon, 3 Dec 2012 17:42:21 +0200 Subject: [PATCH] Faster fetch for the C variant of r5g6b5 src/dest iterator Processing two pixels at once is used to reduce the number of arithmetic operations. The speedup relative to the generic fetch_scanline_r5g6b5() from "pixman-access.c" (pixman was compiled with gcc 4.7.2): MIPS 74K 480MHz : 20.32 MPix/s -> 26.47 MPix/s ARM11 700MHz : 34.95 MPix/s -> 38.22 MPix/s ARM Cortex-A8 1000MHz : 87.44 MPix/s -> 100.92 MPix/s ARM Cortex-A9 1700MHz : 150.95 MPix/s -> 158.13 MPix/s ARM Cortex-A15 1700MHz : 148.91 MPix/s -> 155.42 MPix/s IBM Cell PPU 3200MHz : 75.29 MPix/s -> 98.33 MPix/s Intel Core i7 2800MHz : 257.02 MPix/s -> 376.93 MPix/s That's the performance for C code (SIMD and assembly optimizations are disabled via PIXMAN_DISABLE environment variable). --- pixman/pixman-fast-path.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c index 02a5119..247aea6 100644 --- a/pixman/pixman-fast-path.c +++ b/pixman/pixman-fast-path.c @@ -2170,11 +2170,40 @@ fast_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask) iter->bits += iter->stride; - while (w > 0) + /* Align the source buffer at 4 bytes boundary */ + if (w > 0 && ((uintptr_t)src & 3)) { *dst++ = convert_0565_to_8888 (*src++); w--; } + /* Process two pixels per iteration */ + while ((w -= 2) >= 0) + { + uint32_t sr, sb, sg, t0, t1; + uint32_t s = *(const uint32_t *)src; + src += 2; + sr = (s >> 8) & 0x00F800F8; + sb = (s << 3) & 0x00F800F8; + sg = (s >> 3) & 0x00FC00FC; + sr |= sr >> 5; + sb |= sb >> 5; + sg |= sg >> 6; + t0 = ((sr << 16) & 0x00FF0000) | ((sg << 8) & 0x0000FF00) | + (sb & 0xFF) | 0xFF000000; + t1 = (sr & 0x00FF0000) | ((sg >> 8) & 0x0000FF00) | + (sb >> 16) | 0xFF000000; +#ifdef WORDS_BIGENDIAN + *dst++ = t1; + *dst++ = t0; +#else + *dst++ = t0; + *dst++ = t1; +#endif + } + if (w & 1) + { + *dst = convert_0565_to_8888 (*src); + } return iter->buffer; } -- 2.7.4