From e66fd5ccb6b69dfa1acde36220dc3c3c44026890 Mon Sep 17 00:00:00 2001 From: Siarhei Siamashka Date: Mon, 3 Dec 2012 17:07:31 +0200 Subject: [PATCH] Faster write-back for the C variant of r5g6b5 dest iterator Unrolling loops improves performance, so just use it here. Also, GCC can't properly optimize this code for RISC processors and allocate the 0x1F001F constant in a register. Because this constant is too large to be represented as an immediate operand in instructions, GCC inserts some redundant arithmetic. This problem can be worked around by explicitly using a variable for the 0x1F001F constant and initializing it with a read from a separate volatile variable. In this case GCC is forced to allocate a register for it, because it is no longer seen as a constant. The speedup relative to the generic store_scanline_r5g6b5() from "pixman-access.c" (pixman was compiled with gcc 4.7.2): MIPS 74K 480MHz : 33.22 MPix/s -> 43.42 MPix/s ARM11 700MHz : 50.16 MPix/s -> 78.23 MPix/s ARM Cortex-A8 1000MHz : 117.75 MPix/s -> 196.34 MPix/s ARM Cortex-A9 1700MHz : 177.04 MPix/s -> 320.32 MPix/s ARM Cortex-A15 1700MHz : 231.44 MPix/s -> 261.64 MPix/s IBM Cell PPU 3200MHz : 130.25 MPix/s -> 145.61 MPix/s Intel Core i7 2800MHz : 502.21 MPix/s -> 721.73 MPix/s That's the performance for C code (SIMD and assembly optimizations are disabled via PIXMAN_DISABLE environment variable). --- pixman/pixman-fast-path.c | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c index cbe34bb..02a5119 100644 --- a/pixman/pixman-fast-path.c +++ b/pixman/pixman-fast-path.c @@ -2186,17 +2186,49 @@ fast_dest_fetch_noop (pixman_iter_t *iter, const uint32_t *mask) return iter->buffer; } +/* Helper function for a workaround, which tries to ensure that 0x1F001F + * constant is always allocated in a register on RISC architectures. 
+ */ +static force_inline uint32_t +convert_8888_to_0565_workaround (uint32_t s, uint32_t x1F001F) +{ + uint32_t a, b; + a = (s >> 3) & x1F001F; /* top 5 bits of byte 0 and of byte 2, at bits 0-4 and 16-20 */ + b = s & 0xFC00; /* top 6 bits of byte 1, still at bits 10-15 */ + a |= a >> 5; /* copy the byte-2 field down to bits 11-15 */ + a |= b >> 5; /* place the byte-1 field at bits 5-10 */ + return a; /* bits 16-20 still hold the byte-2 field; the caller below stores only the low 16 bits */ +} + static void fast_write_back_r5g6b5 (pixman_iter_t *iter) { int32_t w = iter->width; uint16_t *dst = (uint16_t *)(iter->bits - iter->stride); const uint32_t *src = iter->buffer; + /* Workaround: the mask is read from a volatile so that the compiler cannot treat it as a compile-time constant; the intent is to keep 0x1F001F in a register */ + static volatile uint32_t volatile_x1F001F = 0x1F001F; + uint32_t x1F001F = volatile_x1F001F; - while (w > 0) + while ((w -= 4) >= 0) /* four pixels per pass; exits with w = leftover - 4 */ { - *dst++ = convert_8888_to_0565 (*src++); - w--; + uint32_t s1 = *src++; + uint32_t s2 = *src++; + uint32_t s3 = *src++; + uint32_t s4 = *src++; + *dst++ = convert_8888_to_0565_workaround (s1, x1F001F); + *dst++ = convert_8888_to_0565_workaround (s2, x1F001F); + *dst++ = convert_8888_to_0565_workaround (s3, x1F001F); + *dst++ = convert_8888_to_0565_workaround (s4, x1F001F); + } + if (w & 2) /* w is negative here, but its low two bits still equal the leftover count (0..3) */ + { + *dst++ = convert_8888_to_0565_workaround (*src++, x1F001F); + *dst++ = convert_8888_to_0565_workaround (*src++, x1F001F); + } + if (w & 1) /* one final odd pixel */ + { + *dst = convert_8888_to_0565_workaround (*src, x1F001F); } } -- 2.7.4