From 3518a0dafa63098d41e466f73d105b7e3e4b12de Mon Sep 17 00:00:00 2001 From: =?utf8?q?S=C3=B8ren=20Sandmann=20Pedersen?= Date: Thu, 24 May 2012 02:49:05 -0400 Subject: [PATCH] Add an iterator that can fetch bilinearly scaled images This new iterator works in a separable way; that is, for a destination scaline, it scales the two involved source scanlines and then caches them so that they can be reused for the next destination scanlines. There are two versions of the code, one that uses 64 bit arithmetic, and one that uses 32 bit arithmetic only. The latter version is used on 32 bit systems, where it is expected to be faster. This scheme saves a substantial amount of arithmetic for larger scalings; the per-pixel times for various configurations as reported by scaling-bench are graphed here: http://people.freedesktop.org/~sandmann/separable.v2/v2.png The "sse2" graph is current default on x86, "mmx" is with sse2 disabled, "old c" is with sse2 and mmx disabled. The "new 32" and "new 64" graphs show times for the new code. As the graphs show, the 64 bit version of the new code beats the "old c" for all scaling ratios. The data was taken on a Sandy Bridge Core i3-2350M CPU @ 2.0 GHz running in 64 bit mode. The data used to generate the graph is available in this directory: http://people.freedesktop.org/~sandmann/separable.v2/ There is also a Gnumeric spreadsheet v2.gnumeric containing the per-pixel values and the graph. V2: - Add error message in the OOM/bad matrix case - Save some shifts by storing the cached scanlines in AGBR order - Special cased version that uses 32 bit arithmetic when sizeof(long) <= 4 --- pixman/pixman-fast-path.c | 241 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 241 insertions(+) diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c index 3982dce..2608268 100644 --- a/pixman/pixman-fast-path.c +++ b/pixman/pixman-fast-path.c @@ -2261,6 +2261,237 @@ fast_write_back_r5g6b5 (pixman_iter_t *iter) } } +typedef struct +{ + int y; + uint64_t * buffer; +} line_t; + +typedef struct +{ + line_t line0; + line_t line1; + pixman_fixed_t y; + pixman_fixed_t x; + uint64_t data[1]; +} bilinear_info_t; + +static void +fetch_horizontal (bits_image_t *image, line_t *line, + int y, pixman_fixed_t x, pixman_fixed_t ux, int n) +{ + uint32_t *bits = image->bits + y * image->rowstride; + int i; + + for (i = 0; i < n; ++i) + { + int x0 = pixman_fixed_to_int (x); + int x1 = x0 + 1; + int32_t dist_x; + + uint32_t left = *(bits + x0); + uint32_t right = *(bits + x1); + + dist_x = pixman_fixed_to_bilinear_weight (x); + dist_x <<= (8 - BILINEAR_INTERPOLATION_BITS); + +#if SIZEOF_LONG <= 4 + { + uint32_t lag, rag, ag; + uint32_t lrb, rrb, rb; + + lag = (left & 0xff00ff00) >> 8; + rag = (right & 0xff00ff00) >> 8; + ag = (lag << 8) + dist_x * (rag - lag); + + lrb = (left & 0x00ff00ff); + rrb = (right & 0x00ff00ff); + rb = (lrb << 8) + dist_x * (rrb - lrb); + + *((uint32_t *)(line->buffer + i)) = ag; + *((uint32_t *)(line->buffer + i) + 1) = rb; + } +#else + { + uint64_t lagrb, ragrb; + uint32_t lag, rag; + uint32_t lrb, rrb; + + lag = (left & 0xff00ff00); + lrb = (left & 0x00ff00ff); + rag = (right & 0xff00ff00); + rrb = (right & 0x00ff00ff); + lagrb = (((uint64_t)lag) << 24) | lrb; + ragrb = (((uint64_t)rag) << 24) | rrb; + + line->buffer[i] = (lagrb << 8) + dist_x * (ragrb - lagrb); + } +#endif + + x += ux; + } + + line->y = y; +} + +static uint32_t * +fast_fetch_bilinear_cover (pixman_iter_t *iter, const uint32_t *mask) +{ + pixman_fixed_t fx, ux; + bilinear_info_t *info = iter->data; + line_t *line0, *line1; + int y0, y1; + int32_t dist_y; + int i; + + fx = info->x; + ux = iter->image->common.transform->matrix[0][0]; + + y0 = pixman_fixed_to_int (info->y); + y1 = y0 + 1; + dist_y = pixman_fixed_to_bilinear_weight (info->y); + dist_y <<= (8 - BILINEAR_INTERPOLATION_BITS); + + line0 = &info->line0; + line1 = &info->line1; + + if (line0->y != y0 || line1->y != y1) + { + if (line0->y == y1 || line1->y == y0) + { + line_t tmp = *line0; + *line0 = *line1; + *line1 = tmp; + } + + if (line0->y != y0) + { + fetch_horizontal ( + &iter->image->bits, line0, y0, fx, ux, iter->width); + } + + if (line1->y != y1) + { + fetch_horizontal ( + &iter->image->bits, line1, y1, fx, ux, iter->width); + } + } + + for (i = 0; i < iter->width; ++i) + { +#if SIZEOF_LONG <= 4 + uint32_t ta, tr, tg, tb; + uint32_t ba, br, bg, bb; + uint32_t tag, trb; + uint32_t bag, brb; + uint32_t a, r, g, b; + + tag = *((uint32_t *)(line0->buffer + i)); + trb = *((uint32_t *)(line0->buffer + i) + 1); + bag = *((uint32_t *)(line1->buffer + i)); + brb = *((uint32_t *)(line1->buffer + i) + 1); + + ta = tag >> 16; + ba = bag >> 16; + a = (ta << 8) + dist_y * (ba - ta); + + tr = trb >> 16; + br = brb >> 16; + r = (tr << 8) + dist_y * (br - tr); + + tg = tag & 0xffff; + bg = bag & 0xffff; + g = (tg << 8) + dist_y * (bg - tg); + + tb = trb & 0xffff; + bb = brb & 0xffff; + b = (tb << 8) + dist_y * (bb - tb); + + a = (a << 8) & 0xff000000; + r = (r << 0) & 0x00ff0000; + g = (g >> 8) & 0x0000ff00; + b = (b >> 16) & 0x000000ff; +#else + uint64_t top = line0->buffer[i]; + uint64_t bot = line1->buffer[i]; + uint64_t tar = (top & 0xffff0000ffff0000ULL) >> 16; + uint64_t bar = (bot & 0xffff0000ffff0000ULL) >> 16; + uint64_t tgb = (top & 0x0000ffff0000ffffULL); + uint64_t bgb = (bot & 0x0000ffff0000ffffULL); + uint64_t ar, gb; + uint32_t a, r, g, b; + + ar = (tar << 8) + dist_y * (bar - tar); + gb = (tgb << 8) + dist_y * (bgb - tgb); + + a = ((ar >> 24) & 0xff000000); + r = ((ar >> 0) & 0x00ff0000); + g = ((gb >> 40) & 0x0000ff00); + b = ((gb >> 16) & 0x000000ff); +#endif + + iter->buffer[i] = a | r | g | b; + } + + info->y += iter->image->common.transform->matrix[1][1]; + + return iter->buffer; +} + +static void +bilinear_cover_iter_fini (pixman_iter_t *iter) +{ + free (iter->data); +} + +static void +fast_bilinear_cover_iter_init (pixman_iter_t *iter, const pixman_iter_info_t *iter_info) +{ + int width = iter->width; + bilinear_info_t *info; + pixman_vector_t v; + + /* Reference point is the center of the pixel */ + v.vector[0] = pixman_int_to_fixed (iter->x) + pixman_fixed_1 / 2; + v.vector[1] = pixman_int_to_fixed (iter->y) + pixman_fixed_1 / 2; + v.vector[2] = pixman_fixed_1; + + if (!pixman_transform_point_3d (iter->image->common.transform, &v)) + goto fail; + + info = malloc (sizeof (*info) + (2 * width - 1) * sizeof (uint64_t)); + if (!info) + goto fail; + + info->x = v.vector[0] - pixman_fixed_1 / 2; + info->y = v.vector[1] - pixman_fixed_1 / 2; + + /* It is safe to set the y coordinates to -1 initially + * because COVER_CLIP_BILINEAR ensures that we will only + * be asked to fetch lines in the [0, height) interval + */ + info->line0.y = -1; + info->line0.buffer = &(info->data[0]); + info->line1.y = -1; + info->line1.buffer = &(info->data[width]); + + iter->get_scanline = fast_fetch_bilinear_cover; + iter->fini = bilinear_cover_iter_fini; + + iter->data = info; + return; + +fail: + /* Something went wrong, either a bad matrix or OOM; in such cases, + * we don't guarantee any particular rendering. + */ + _pixman_log_error ( + FUNC, "Allocation failure or bad matrix, skipping rendering\n"); + + iter->get_scanline = _pixman_iter_get_scanline_noop; + iter->fini = bilinear_cover_iter_fini; +} + #define IMAGE_FLAGS \ (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \ FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST) @@ -2280,6 +2511,16 @@ static const pixman_iter_info_t fast_iters[] = _pixman_iter_init_bits_stride, fast_dest_fetch_noop, fast_write_back_r5g6b5 }, + { PIXMAN_a8r8g8b8, + (FAST_PATH_STANDARD_FLAGS | + FAST_PATH_SCALE_TRANSFORM | + FAST_PATH_BILINEAR_FILTER | + FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR), + ITER_NARROW | ITER_SRC, + fast_bilinear_cover_iter_init, + NULL, NULL + }, + { PIXMAN_null }, }; -- 2.7.4