sse2: bilinear fast path for src_x888_8888
author Siarhei Siamashka <siarhei.siamashka@gmail.com>
Wed, 2 Oct 2013 00:54:30 +0000 (00:54 +0000)
committer Siarhei Siamashka <siarhei.siamashka@gmail.com>
Sun, 13 Oct 2013 21:26:51 +0000 (00:26 +0300)
Running cairo-perf-trace benchmark on Intel Core2 T7300:

Before:
[  0]    image    t-firefox-canvas-swscroll    1.989    2.008   0.43%    8/8
[  1]    image        firefox-canvas-scroll    4.574    4.609   0.50%    8/8

After:
[  0]    image    t-firefox-canvas-swscroll    1.404    1.418   0.51%    8/8
[  1]    image        firefox-canvas-scroll    4.228    4.259   0.36%    8/8

pixman/pixman-sse2.c

index 2ab2690..a6e7808 100644 (file)
@@ -5751,6 +5751,66 @@ FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
                               NORMAL, FLAG_NONE)
 
 static force_inline void
+scaled_bilinear_scanline_sse2_x888_8888_SRC (uint32_t *       dst, /* Scanline worker: writes w pixels to dst, each with its top byte forced to 0xFF. */
+                                            const uint32_t * mask,
+                                            const uint32_t * src_top,
+                                            const uint32_t * src_bottom,
+                                            int32_t          w, /* number of destination pixels to produce */
+                                            int              wt,
+                                            int              wb,
+                                            pixman_fixed_t   vx_,
+                                            pixman_fixed_t   unit_x_,
+                                            pixman_fixed_t   max_vx,
+                                            pixman_bool_t    zero_src) /* NOTE(review): mask, src_top, src_bottom, wt, wb, max_vx and zero_src are never named in this body; any use would be inside the BILINEAR_* macros (defined elsewhere) -- confirm */
+{
+    intptr_t vx = vx_; /* pointer-sized copies of the fixed-point arguments; presumably read and advanced by the BILINEAR_* macros -- confirm */
+    intptr_t unit_x = unit_x_;
+    BILINEAR_DECLARE_VARIABLES;
+    uint32_t pix1, pix2;
+
+    while (w && ((uintptr_t)dst & 15)) /* prologue: one pixel at a time until dst is 16-byte aligned or w is exhausted */
+    {
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); /* presumably stores one interpolated pixel in pix1 (macro defined elsewhere) */
+       *dst++ = pix1 | 0xFF000000; /* force the top byte to 0xFF */
+       w--;
+    }
+
+    while ((w -= 4) >= 0) { /* main loop: four pixels per iteration; exits with w in -4..-1 */
+       __m128i xmm_src;
+       BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
+       _mm_store_si128 ((__m128i *)dst, _mm_or_si128 (xmm_src, mask_ff000000)); /* aligned 16-byte store, relying on the prologue; mask_ff000000 is defined elsewhere, presumably 0xFF000000 per lane -- confirm */
+       dst += 4;
+    }
+
+    if (w & 2) /* w is negative here, but in two's complement its low two bits equal the 0..3 leftover pixels */
+    {
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+       *dst++ = pix1 | 0xFF000000;
+       *dst++ = pix2 | 0xFF000000;
+    }
+
+    if (w & 1) /* final odd pixel, if any */
+    {
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+       *dst = pix1 | 0xFF000000;
+    }
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_cover_SRC, /* COVER variant; macro is defined elsewhere, presumably generates a main loop under this name -- confirm */
+                              scaled_bilinear_scanline_sse2_x888_8888_SRC, /* scanline worker passed to the macro */
+                              uint32_t, uint32_t, uint32_t, /* presumably source, mask and destination pixel types -- confirm against the macro definition */
+                              COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_pad_SRC, /* PAD variant; same scanline worker and type arguments */
+                              scaled_bilinear_scanline_sse2_x888_8888_SRC,
+                              uint32_t, uint32_t, uint32_t,
+                              PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_normal_SRC, /* NORMAL variant; same scanline worker and type arguments */
+                              scaled_bilinear_scanline_sse2_x888_8888_SRC,
+                              uint32_t, uint32_t, uint32_t,
+                              NORMAL, FLAG_NONE)
+
+static force_inline void
 scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
                                              const uint32_t * mask,
                                              const uint32_t * src_top,
@@ -6247,6 +6307,13 @@ static const pixman_fast_path_t sse2_fast_paths[] =
     SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),
 
+    SIMPLE_BILINEAR_FAST_PATH_COVER  (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
+    SIMPLE_BILINEAR_FAST_PATH_COVER  (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
+    SIMPLE_BILINEAR_FAST_PATH_PAD    (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
+    SIMPLE_BILINEAR_FAST_PATH_PAD    (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
+    SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
+    SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
+
     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),