From c748650d700c2f18f1587f06ada3b58d6ddc18d3 Mon Sep 17 00:00:00 2001 From: Siarhei Siamashka Date: Wed, 8 Sep 2010 09:30:23 +0300 Subject: [PATCH] Use more unrolling for scaled src_0565_0565 with nearest filter Benchmark from Intel Core i7 860: == before == op=1, src_fmt=10020565, dst_fmt=10020565, speed=1335.29 MPix/s == after == op=1, src_fmt=10020565, dst_fmt=10020565, speed=1550.96 MPix/s == performance of nonscaled src_0565_0565 operation as a reference == op=1, src_fmt=10020565, dst_fmt=10020565, speed=2401.31 MPix/s Benchmark from ARM Cortex-A8: == before == op=1, src_fmt=10020565, dst_fmt=10020565, speed=81.79 MPix/s == after == op=1, src_fmt=10020565, dst_fmt=10020565, speed=89.55 MPix/s == performance of nonscaled src_0565_0565 operation as a reference == op=1, src_fmt=10020565, dst_fmt=10020565, speed=197.44 MPix/s --- pixman/pixman-fast-path.c | 51 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c index c210919..5d5fa95 100644 --- a/pixman/pixman-fast-path.c +++ b/pixman/pixman-fast-path.c @@ -1399,15 +1399,60 @@ FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER); FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, NONE); FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD); FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL); -FAST_NEAREST (565_565_cover, 0565, 0565, uint16_t, uint16_t, SRC, COVER); -FAST_NEAREST (565_565_none, 0565, 0565, uint16_t, uint16_t, SRC, NONE); -FAST_NEAREST (565_565_pad, 0565, 0565, uint16_t, uint16_t, SRC, PAD); FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL); FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER); FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE); FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD); FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL); +/* Use more unrolling for src_0565_0565 because it is typically CPU bound */ +static force_inline void +scaled_nearest_scanline_565_565_SRC (uint16_t * dst, + uint16_t * src, + int32_t w, + pixman_fixed_t vx, + pixman_fixed_t unit_x, + pixman_fixed_t max_vx) +{ + uint16_t tmp1, tmp2, tmp3, tmp4; + while ((w -= 4) >= 0) + { + tmp1 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp2 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp3 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp4 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + *dst++ = tmp1; + *dst++ = tmp2; + *dst++ = tmp3; + *dst++ = tmp4; + } + if (w & 2) + { + tmp1 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + tmp2 = src[pixman_fixed_to_int (vx)]; + vx += unit_x; + *dst++ = tmp1; + *dst++ = tmp2; + } + if (w & 1) + *dst++ = src[pixman_fixed_to_int (vx)]; +} + +FAST_NEAREST_MAINLOOP (565_565_cover_SRC, + scaled_nearest_scanline_565_565_SRC, + uint16_t, uint16_t, COVER); +FAST_NEAREST_MAINLOOP (565_565_none_SRC, + scaled_nearest_scanline_565_565_SRC, + uint16_t, uint16_t, NONE); +FAST_NEAREST_MAINLOOP (565_565_pad_SRC, + scaled_nearest_scanline_565_565_SRC, + uint16_t, uint16_t, PAD); + static force_inline uint32_t fetch_nearest (pixman_repeat_t src_repeat, pixman_format_code_t format, -- 2.7.4