From: Siarhei Siamashka
Date: Tue, 3 Sep 2013 01:39:54 +0000 (+0300)
Subject: sse2: faster bilinear scaling (pack 4 pixels to write with MOVDQA)
X-Git-Tag: pixman-0.31.2~39
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=700db9d872bdc49399a95565ffe0d345db11717a;p=platform%2Fupstream%2Fpixman.git

The loops are already unrolled, so it was just a matter of packing
4 pixels into a single XMM register and doing aligned 128-bit writes
to memory via MOVDQA instructions for the SRC compositing operator
fast path. For the other fast paths, this XMM register is also routed
directly to further processing instead of going through extra
reshuffling. This replaces "8 PACKSSDW/PACKUSWB + 4 MOVD" instructions
with "3 PACKSSDW/PACKUSWB + 1 MOVDQA" per 4 pixels, which results in a
clear performance improvement (a standalone sketch of this packing
appears after this message).

There are also some other (less important) tweaks:

1. Convert 'pixman_fixed_t' to 'intptr_t' before using it as an index
   for addressing memory. The problem is that 'pixman_fixed_t' is a
   32-bit data type, which has to be extended to a 64-bit offset every
   time it is used for addressing memory, and this needs extra
   instructions on 64-bit systems (sketched below).

2. Recalculate the horizontal interpolation weights only once per
   4 pixels by treating the XMM register as four pairs of 16-bit
   values. Each of these 16-bit/16-bit pairs can be replicated to fill
   the whole 128-bit register by using PSHUFD instructions (sketched
   below). So we get "3 PADDW/PSRLW + 4 PSHUFD" instructions per
   4 pixels instead of "12 PADDW/PSRLW" per 4 pixels (or
   "3 PADDW/PSRLW" per pixel).

   Now a good question is whether replacing "9 PADDW/PSRLW" with
   "4 PSHUFD" is a favourable exchange. As it turns out, PSHUFD
   instructions are very fast on new Intel processors (including
   Atoms), but rather slow on the first generation of Core2 (Merom)
   and on other processors of that time or older. A good instruction
   latency/throughput table covering all the relevant processors can
   be found at:

       http://www.agner.org/optimize/instruction_tables.pdf

   Enabling this optimization is controlled by the PSHUFD_IS_FAST
   define in "pixman-sse2.c".

3. One use of the PSHUFD instruction (_mm_shuffle_epi32 intrinsic) in
   the older code has also been replaced by its PUNPCKLQDQ equivalent
   (_mm_unpacklo_epi64 intrinsic) in the PSHUFD_IS_FAST=0
   configuration. The PUNPCKLQDQ instruction is usually faster on
   older processors, but it has a side effect: instead of fully
   overwriting the destination register like PSHUFD does, it retains
   half of the original value, which may inhibit some compiler
   optimizations.
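For illustration, here is a minimal standalone sketch of the packing
scheme described above (the function name is made up for this example;
in the patch itself the same work is done by the new
BILINEAR_INTERPOLATE_FOUR_PIXELS macro):

    #include <stdint.h>
    #include <emmintrin.h>

    /* Narrow four interpolation results (32 bits per channel, one
     * pixel per XMM register) with 2x PACKSSDW + 1x PACKUSWB, then
     * write all 4 pixels with a single aligned MOVDQA store.
     * 'dst' must be 16-byte aligned; the scanline functions ensure
     * this with a scalar lead-in loop. */
    static void
    pack_and_store_4_pixels (uint32_t *dst,
                             __m128i p1, __m128i p2, __m128i p3, __m128i p4)
    {
        p1 = _mm_packs_epi32 (p1, p2);          /* PACKSSDW: 32 -> 16 bit */
        p3 = _mm_packs_epi32 (p3, p4);          /* PACKSSDW: 32 -> 16 bit */
        p1 = _mm_packus_epi16 (p1, p3);         /* PACKUSWB: 16 -> 8 bit  */
        _mm_store_si128 ((__m128i *) dst, p1);  /* MOVDQA */
    }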
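Tweak 1 can likewise be shown with a small standalone sketch (the
typedef and function are simplified illustrations, not the pixman
code):

    #include <stdint.h>

    typedef int32_t pixman_fixed_t;  /* 16.16 fixed point */

    /* With a 32-bit 'vx' the compiler has to widen the index to
     * 64 bits inside the loop on x86-64; converting it to the
     * pointer-sized intptr_t once, outside the loop, avoids that
     * per-pixel extension. */
    static uint32_t
    sum_scanline (const uint32_t *src, pixman_fixed_t vx_,
                  pixman_fixed_t unit_x_, int w)
    {
        intptr_t vx = vx_;          /* widen once */
        intptr_t unit_x = unit_x_;
        uint32_t acc = 0;

        while (w--)
        {
            acc += src[vx >> 16];   /* index is already pointer-sized */
            vx += unit_x;
        }
        return acc;
    }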
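And a sketch of the weight replication from tweak 2 (the helper name
is illustrative; _mm_shuffle_epi32 requires a compile-time immediate,
hence the switch — in the patch the phase is always a literal constant
passed to the helper macro, so the compiler folds the immediate and no
branch is generated):

    #include <emmintrin.h>

    /* xmm_wh_state holds four 16-bit/16-bit horizontal weight pairs,
     * one pair per pixel; PSHUFD broadcasts pair 'phase' (0..3) to
     * all four 32-bit lanes so a single PMADDWD can consume it. */
    static __m128i
    broadcast_weight_pair (__m128i xmm_wh_state, int phase)
    {
        switch (phase)
        {
        case 0:  return _mm_shuffle_epi32 (xmm_wh_state, _MM_SHUFFLE (0, 0, 0, 0));
        case 1:  return _mm_shuffle_epi32 (xmm_wh_state, _MM_SHUFFLE (1, 1, 1, 1));
        case 2:  return _mm_shuffle_epi32 (xmm_wh_state, _MM_SHUFFLE (2, 2, 2, 2));
        default: return _mm_shuffle_epi32 (xmm_wh_state, _MM_SHUFFLE (3, 3, 3, 3));
        }
    }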
Benchmarks with "lowlevel-blt-bench -b src_8888_8888" using GCC 4.8.1
on an x86-64 system with default optimizations. The results are in
MPix/s:

====== Intel Core2 T7300 (2GHz) ======

old:                    src_8888_8888    = L1: 128.69  L2: 125.07  M: 124.86
                        over_8888_8888   = L1:  83.19  L2:  81.73  M:  80.63
                        over_8888_n_8888 = L1:  79.56  L2:  78.61  M:  77.85
                        over_8888_8_8888 = L1:  77.15  L2:  75.79  M:  74.63

new (PSHUFD_IS_FAST=0): src_8888_8888    = L1: 168.67  L2: 163.26  M: 162.44
                        over_8888_8888   = L1: 102.91  L2: 100.43  M:  99.01
                        over_8888_n_8888 = L1:  97.40  L2:  95.64  M:  94.24
                        over_8888_8_8888 = L1:  98.04  L2:  95.83  M:  94.33

new (PSHUFD_IS_FAST=1): src_8888_8888    = L1: 154.67  L2: 149.16  M: 148.48
                        over_8888_8888   = L1:  95.97  L2:  93.90  M:  91.85
                        over_8888_n_8888 = L1:  93.18  L2:  91.47  M:  90.15
                        over_8888_8_8888 = L1:  95.33  L2:  93.32  M:  91.42

====== Intel Core i7 860 (2.8GHz) ======

old:                    src_8888_8888    = L1: 323.48  L2: 318.86  M: 314.81
                        over_8888_8888   = L1: 187.38  L2: 186.74  M: 182.46

new (PSHUFD_IS_FAST=0): src_8888_8888    = L1: 373.06  L2: 370.94  M: 368.32
                        over_8888_8888   = L1: 217.28  L2: 215.57  M: 211.32

new (PSHUFD_IS_FAST=1): src_8888_8888    = L1: 401.98  L2: 397.65  M: 395.61
                        over_8888_8888   = L1: 218.89  L2: 217.56  M: 213.48

The most interesting benchmark is "src_8888_8888" (because this code
can be reused for a generic non-separable SSE2 bilinear fetch
iterator). The results show that PSHUFD instructions are bad for the
Intel Core2 T7300 (Merom core) and good for the Intel Core i7 860
(Nehalem core). Both of these processors support SSSE3 instructions,
though, so they are not the primary targets for SSE2 code. But without
any more relevant hardware to test on, PSHUFD_IS_FAST=0 seems to be a
reasonable default for SSE2 code and old processors (until the runtime
CPU feature detection becomes clever enough to recognize different
microarchitectures).

(Rebased on top of the patch that removes support for 8-bit bilinear
filtering -ssp)
---

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index a629565..42c7209 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -30,6 +30,9 @@
 #include <config.h>
 #endif
 
+/* PSHUFD is slow on a lot of old processors, and new processors have SSSE3 */
+#define PSHUFD_IS_FAST 0
+
 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
 #include <emmintrin.h> /* for SSE2 intrinsics */
 #include "pixman-private.h"
@@ -5554,50 +5557,134 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
 			      uint32_t, uint32_t, uint32_t,
 			      NORMAL, TRUE, TRUE)
 
+#if PSHUFD_IS_FAST
+
+/***********************************************************************************/
+
 # define BILINEAR_DECLARE_VARIABLES \
     const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \
     const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \
     const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \
-    const __m128i xmm_ux = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x, \
+    const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x, \
+					   unit_x, -unit_x, unit_x, -unit_x); \
+    const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4, \
+					   unit_x * 4, -unit_x * 4, \
+					   unit_x * 4, -unit_x * 4, \
+					   unit_x * 4, -unit_x * 4); \
+    const __m128i xmm_zero = _mm_setzero_si128 (); \
+    __m128i xmm_x = _mm_set_epi16 (vx + unit_x * 3, -(vx + 1) - unit_x * 3, \
+				   vx + unit_x * 2, -(vx + 1) - unit_x * 2, \
+				   vx + unit_x * 1, -(vx + 1) - unit_x * 1, \
+				   vx + unit_x * 0, -(vx + 1) - unit_x * 0); \
+    __m128i xmm_wh_state;
+
+#define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase_) \
+do { \
+    int phase = phase_; \
+    __m128i xmm_wh, xmm_a, xmm_b; \
+    /* fetch 2x2 pixel block into sse2 registers */ \
+    __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]); \
+    __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]); \
+    vx += unit_x; \
+    /* vertical interpolation */ \
+    xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt); \
+    xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb); \
+    xmm_a = _mm_add_epi16 (xmm_a, xmm_b); \
+    /* calculate horizontal weights */ \
+    if (phase <= 0) \
+    { \
+	xmm_wh_state = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x, \
+					16 - BILINEAR_INTERPOLATION_BITS)); \
+	xmm_x = _mm_add_epi16 (xmm_x, (phase < 0) ? xmm_ux1 : xmm_ux4); \
+	phase = 0; \
+    } \
+    xmm_wh = _mm_shuffle_epi32 (xmm_wh_state, _MM_SHUFFLE (phase, phase, \
+							   phase, phase)); \
+    /* horizontal interpolation */ \
+    xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 ( \
+		xmm_a, _MM_SHUFFLE (1, 0, 3, 2)), xmm_a), xmm_wh); \
+    /* shift the result */ \
+    pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2); \
+} while (0)
+
+#else /************************************************************************/
+
+# define BILINEAR_DECLARE_VARIABLES \
+    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \
+    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \
+    const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \
+    const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x, \
 					   unit_x, -unit_x, unit_x, -unit_x); \
+    const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4, \
+					   unit_x * 4, -unit_x * 4, \
+					   unit_x * 4, -unit_x * 4, \
+					   unit_x * 4, -unit_x * 4); \
     const __m128i xmm_zero = _mm_setzero_si128 (); \
     __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1), \
 				   vx, -(vx + 1), vx, -(vx + 1))
 
-#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \
+#define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase) \
 do { \
-    __m128i xmm_wh, a; \
+    __m128i xmm_wh, xmm_a, xmm_b; \
+    (void)xmm_ux4; /* suppress warning: unused variable 'xmm_ux4' */ \
     /* fetch 2x2 pixel block into sse2 registers */ \
-    __m128i tltr = _mm_loadl_epi64 ( \
-			    (__m128i *)&src_top[pixman_fixed_to_int (vx)]); \
-    __m128i blbr = _mm_loadl_epi64 ( \
-			    (__m128i *)&src_bottom[pixman_fixed_to_int (vx)]); \
+    __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]); \
+    __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]); \
     vx += unit_x; \
     /* vertical interpolation */ \
-    a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), \
-					xmm_wt), \
-		       _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), \
-					xmm_wb)); \
+    xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt); \
+    xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb); \
+    xmm_a = _mm_add_epi16 (xmm_a, xmm_b); \
     /* calculate horizontal weights */ \
     xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x, \
-					16 - BILINEAR_INTERPOLATION_BITS)); \
-    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
+					16 - BILINEAR_INTERPOLATION_BITS)); \
+    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1); \
     /* horizontal interpolation */ \
-    a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 ( \
-		a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh); \
-    /* shift and pack the result */ \
-    a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2); \
-    a = _mm_packs_epi32 (a, a); \
-    a = _mm_packus_epi16 (a, a); \
-    pix = _mm_cvtsi128_si32 (a); \
+    xmm_b = _mm_unpacklo_epi64 (/* any value is fine here */ xmm_b, xmm_a); \
+    xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (xmm_b, xmm_a), xmm_wh); \
+    /* shift the result */ \
+    pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2); \
 } while (0)
 
+/***********************************************************************************/
+
+#endif
+
+#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix); \
+do { \
+    __m128i xmm_pix; \
+    BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix, -1); \
+    xmm_pix = _mm_packs_epi32 (xmm_pix, xmm_pix); \
+    xmm_pix = _mm_packus_epi16 (xmm_pix, xmm_pix); \
+    pix = _mm_cvtsi128_si32 (xmm_pix); \
+} while(0)
+
+#define BILINEAR_INTERPOLATE_FOUR_PIXELS(pix); \
+do { \
+    __m128i xmm_pix1, xmm_pix2, xmm_pix3, xmm_pix4; \
+    BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix1, 0); \
+    BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix2, 1); \
+    BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix3, 2); \
+    BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix4, 3); \
+    xmm_pix1 = _mm_packs_epi32 (xmm_pix1, xmm_pix2); \
+    xmm_pix3 = _mm_packs_epi32 (xmm_pix3, xmm_pix4); \
+    pix = _mm_packus_epi16 (xmm_pix1, xmm_pix3); \
+} while(0)
+
 #define BILINEAR_SKIP_ONE_PIXEL() \
 do { \
     vx += unit_x; \
-    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
+    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1); \
 } while(0)
 
+#define BILINEAR_SKIP_FOUR_PIXELS() \
+do { \
+    vx += unit_x * 4; \
+    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux4); \
+} while(0)
+
+/***********************************************************************************/
+
 static force_inline void
 scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
 					     const uint32_t * mask,
@@ -5606,24 +5693,28 @@ scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
 					     int32_t          w,
 					     int              wt,
 					     int              wb,
-					     pixman_fixed_t   vx,
-					     pixman_fixed_t   unit_x,
+					     pixman_fixed_t   vx_,
+					     pixman_fixed_t   unit_x_,
 					     pixman_fixed_t   max_vx,
 					     pixman_bool_t    zero_src)
 {
+    intptr_t vx = vx_;
+    intptr_t unit_x = unit_x_;
     BILINEAR_DECLARE_VARIABLES;
-    uint32_t pix1, pix2, pix3, pix4;
+    uint32_t pix1, pix2;
 
-    while ((w -= 4) >= 0)
+    while (w && ((uintptr_t)dst & 15))
     {
 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
 	*dst++ = pix1;
-	*dst++ = pix2;
-	*dst++ = pix3;
-	*dst++ = pix4;
+	w--;
+    }
+
+    while ((w -= 4) >= 0) {
+	__m128i xmm_src;
+	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
+	_mm_store_si128 ((__m128i *)dst, xmm_src);
+	dst += 4;
     }
 
     if (w & 2)
@@ -5667,13 +5758,15 @@ scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
 					      int32_t          w,
 					      int              wt,
 					      int              wb,
-					      pixman_fixed_t   vx,
-					      pixman_fixed_t   unit_x,
+					      pixman_fixed_t   vx_,
+					      pixman_fixed_t   unit_x_,
 					      pixman_fixed_t   max_vx,
 					      pixman_bool_t    zero_src)
 {
+    intptr_t vx = vx_;
+    intptr_t unit_x = unit_x_;
     BILINEAR_DECLARE_VARIABLES;
-    uint32_t pix1, pix2, pix3, pix4;
+    uint32_t pix1, pix2;
 
     while (w && ((uintptr_t)dst & 15))
     {
@@ -5695,12 +5788,7 @@ scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
 	__m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
 	__m128i xmm_alpha_hi, xmm_alpha_lo;
 
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
-
-	xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
+	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
 
 	if (!is_zero (xmm_src))
 	{
@@ -5767,13 +5855,15 @@ scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
 						int32_t          w,
 						int              wt,
 						int              wb,
-						pixman_fixed_t   vx,
-						pixman_fixed_t   unit_x,
+						pixman_fixed_t   vx_,
+						pixman_fixed_t   unit_x_,
 						pixman_fixed_t   max_vx,
 						pixman_bool_t    zero_src)
 {
+    intptr_t vx = vx_;
+    intptr_t unit_x = unit_x_;
     BILINEAR_DECLARE_VARIABLES;
-    uint32_t pix1, pix2, pix3, pix4;
+    uint32_t pix1, pix2;
     uint32_t m;
 
     while (w && ((uintptr_t)dst & 15))
@@ -5824,12 +5914,7 @@ scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
 
 	if (m)
 	{
-	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
-	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
-	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
-	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
-
-	    xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
+	    BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
 
 	    if (m == 0xffffffff && is_opaque (xmm_src))
 	    {
@@ -5856,10 +5941,7 @@ scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
 	}
 	else
 	{
-	    BILINEAR_SKIP_ONE_PIXEL ();
-	    BILINEAR_SKIP_ONE_PIXEL ();
-	    BILINEAR_SKIP_ONE_PIXEL ();
-	    BILINEAR_SKIP_ONE_PIXEL ();
+	    BILINEAR_SKIP_FOUR_PIXELS ();
 	}
 
 	w -= 4;
@@ -5931,13 +6013,15 @@ scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t *       dst,
 						int32_t          w,
 						int              wt,
 						int              wb,
-						pixman_fixed_t   vx,
-						pixman_fixed_t   unit_x,
+						pixman_fixed_t   vx_,
+						pixman_fixed_t   unit_x_,
 						pixman_fixed_t   max_vx,
 						pixman_bool_t    zero_src)
 {
+    intptr_t vx = vx_;
+    intptr_t unit_x = unit_x_;
     BILINEAR_DECLARE_VARIABLES;
-    uint32_t pix1, pix2, pix3, pix4;
+    uint32_t pix1;
     __m128i xmm_mask;
 
     if (zero_src || (*mask >> 24) == 0)
@@ -5967,19 +6051,15 @@ scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t *       dst,
 
     while (w >= 4)
     {
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
+	__m128i xmm_src;
+	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
 
-	if (pix1 | pix2 | pix3 | pix4)
+	if (!is_zero (xmm_src))
 	{
-	    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+	    __m128i xmm_src_lo, xmm_src_hi;
 	    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
 	    __m128i xmm_alpha_lo, xmm_alpha_hi;
 
-	    xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
-
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);