Use more unrolling for scaled src_0565_0565 with nearest filter
authorSiarhei Siamashka <siarhei.siamashka@nokia.com>
Wed, 8 Sep 2010 06:30:23 +0000 (09:30 +0300)
committerSiarhei Siamashka <siarhei.siamashka@nokia.com>
Sun, 10 Oct 2010 22:07:01 +0000 (01:07 +0300)
Benchmark from Intel Core i7 860:

    == before ==
    op=1, src_fmt=10020565, dst_fmt=10020565, speed=1335.29 MPix/s

    == after ==
    op=1, src_fmt=10020565, dst_fmt=10020565, speed=1550.96 MPix/s

    == performance of nonscaled src_0565_0565 operation as a reference ==
    op=1, src_fmt=10020565, dst_fmt=10020565, speed=2401.31 MPix/s

Benchmark from ARM Cortex-A8:

    == before ==
    op=1, src_fmt=10020565, dst_fmt=10020565, speed=81.79 MPix/s

    == after ==
    op=1, src_fmt=10020565, dst_fmt=10020565, speed=89.55 MPix/s

    == performance of nonscaled src_0565_0565 operation as a reference ==
    op=1, src_fmt=10020565, dst_fmt=10020565, speed=197.44 MPix/s

pixman/pixman-fast-path.c

index c210919..5d5fa95 100644 (file)
@@ -1399,15 +1399,60 @@ FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER);
 FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, NONE);
 FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD);
 FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL);
-FAST_NEAREST (565_565_cover, 0565, 0565, uint16_t, uint16_t, SRC, COVER);
-FAST_NEAREST (565_565_none, 0565, 0565, uint16_t, uint16_t, SRC, NONE);
-FAST_NEAREST (565_565_pad, 0565, 0565, uint16_t, uint16_t, SRC, PAD);
 FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL);
 FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER);
 FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE);
 FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD);
 FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL);
 
+/* Use more unrolling for src_0565_0565 because it is typically CPU bound */
+static force_inline void
+scaled_nearest_scanline_565_565_SRC (uint16_t *      dst,
+                                    uint16_t *      src,
+                                    int32_t         w,
+                                    pixman_fixed_t  vx,
+                                    pixman_fixed_t  unit_x,
+                                    pixman_fixed_t  max_vx)
+{
+    uint16_t tmp1, tmp2, tmp3, tmp4;
+    while ((w -= 4) >= 0)
+    {
+       tmp1 = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+       tmp2 = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+       tmp3 = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+       tmp4 = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+       *dst++ = tmp1;
+       *dst++ = tmp2;
+       *dst++ = tmp3;
+       *dst++ = tmp4;
+    }
+    if (w & 2)
+    {
+       tmp1 = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+       tmp2 = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+       *dst++ = tmp1;
+       *dst++ = tmp2;
+    }
+    if (w & 1)
+       *dst++ = src[pixman_fixed_to_int (vx)];
+}
+
+FAST_NEAREST_MAINLOOP (565_565_cover_SRC,
+                      scaled_nearest_scanline_565_565_SRC,
+                      uint16_t, uint16_t, COVER);
+FAST_NEAREST_MAINLOOP (565_565_none_SRC,
+                      scaled_nearest_scanline_565_565_SRC,
+                      uint16_t, uint16_t, NONE);
+FAST_NEAREST_MAINLOOP (565_565_pad_SRC,
+                      scaled_nearest_scanline_565_565_SRC,
+                      uint16_t, uint16_t, PAD);
+
 static force_inline uint32_t
 fetch_nearest (pixman_repeat_t src_repeat,
               pixman_format_code_t format,