vmx: implement fast path scaled nearest vmx_8888_8888_OVER
author Oded Gabbay <oded.gabbay@gmail.com>
Sun, 28 Jun 2015 20:25:24 +0000 (23:25 +0300)
committer Oded Gabbay <oded.gabbay@gmail.com>
Thu, 16 Jul 2015 13:13:35 +0000 (16:13 +0300)
It was benchmarked against commit id 2be523b from pixman/master

POWER8, 8 cores, 3.4GHz, RHEL 7.1 ppc64le.
reference memcpy speed = 24764.8MB/s (6191.2MP/s for 32bpp fills)

                Before           After           Change
              ---------------------------------------------
L1              134.36          181.68          +35.22%
L2              135.07          180.67          +33.76%
M               134.6           180.51          +34.11%
HT              121.77          128.79          +5.76%
VT              120.49          145.07          +20.40%
R               93.83           102.3           +9.03%
RT              50.82           46.93           -7.65%
Kops/s          448             422             -5.80%

Cairo trimmed benchmarks:

Speedups
========
t-firefox-asteroids  533.92 -> 497.92 :  1.07x
    t-midori-zoomed  692.98 -> 651.24 :  1.06x

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Acked-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
pixman/pixman-vmx.c

index 64e9125..0950850 100644 (file)
@@ -2954,6 +2954,129 @@ vmx_composite_add_8888_8888 (pixman_implementation_t *imp,
     }
 }
 
+static force_inline void
+scaled_nearest_scanline_vmx_8888_8888_OVER (uint32_t*       pd,
+                                            const uint32_t* ps,
+                                            int32_t         w,
+                                            pixman_fixed_t  vx,
+                                            pixman_fixed_t  unit_x,
+                                            pixman_fixed_t  src_width_fixed,
+                                            pixman_bool_t   fully_transparent_src)
+{
+    uint32_t s, d;
+    const uint32_t* pm = NULL;
+
+    vector unsigned int vmx_dst_lo, vmx_dst_hi;
+    vector unsigned int vmx_src_lo, vmx_src_hi;
+    vector unsigned int vmx_alpha_lo, vmx_alpha_hi;
+
+    if (fully_transparent_src)
+       return;
+
+    /* Align dst on a 16-byte boundary */
+    while (w && ((uintptr_t)pd & 15))
+    {
+       d = *pd;
+       s = combine1 (ps + pixman_fixed_to_int (vx), pm);
+       vx += unit_x;
+       while (vx >= 0)
+           vx -= src_width_fixed;
+
+       *pd++ = core_combine_over_u_pixel_vmx (s, d);
+       if (pm)
+           pm++;
+       w--;
+    }
+
+    while (w >= 4)
+    {
+       vector unsigned int tmp;
+       uint32_t tmp1, tmp2, tmp3, tmp4;
+
+       tmp1 = *(ps + pixman_fixed_to_int (vx));
+       vx += unit_x;
+       while (vx >= 0)
+           vx -= src_width_fixed;
+       tmp2 = *(ps + pixman_fixed_to_int (vx));
+       vx += unit_x;
+       while (vx >= 0)
+           vx -= src_width_fixed;
+       tmp3 = *(ps + pixman_fixed_to_int (vx));
+       vx += unit_x;
+       while (vx >= 0)
+           vx -= src_width_fixed;
+       tmp4 = *(ps + pixman_fixed_to_int (vx));
+       vx += unit_x;
+       while (vx >= 0)
+           vx -= src_width_fixed;
+
+       tmp[0] = tmp1;
+       tmp[1] = tmp2;
+       tmp[2] = tmp3;
+       tmp[3] = tmp4;
+
+       vmx_src_hi = combine4 ((const uint32_t *) &tmp, pm);
+
+       if (is_opaque (vmx_src_hi))
+       {
+           save_128_aligned (pd, vmx_src_hi);
+       }
+       else if (!is_zero (vmx_src_hi))
+       {
+           vmx_dst_hi = load_128_aligned (pd);
+
+           unpack_128_2x128 (vmx_src_hi, (vector unsigned int) AVV(0),
+                               &vmx_src_lo, &vmx_src_hi);
+
+           unpack_128_2x128 (vmx_dst_hi, (vector unsigned int) AVV(0),
+                               &vmx_dst_lo, &vmx_dst_hi);
+
+           expand_alpha_2x128 (
+               vmx_src_lo, vmx_src_hi, &vmx_alpha_lo, &vmx_alpha_hi);
+
+           over_2x128 (&vmx_src_lo, &vmx_src_hi,
+                       &vmx_alpha_lo, &vmx_alpha_hi,
+                       &vmx_dst_lo, &vmx_dst_hi);
+
+           /* rebuild the 4 pixel data and save */
+           save_128_aligned (pd, pack_2x128_128 (vmx_dst_lo, vmx_dst_hi));
+       }
+
+       w -= 4;
+       pd += 4;
+       if (pm)
+           pm += 4;
+    }
+
+    while (w)
+    {
+       d = *pd;
+       s = combine1 (ps + pixman_fixed_to_int (vx), pm);
+       vx += unit_x;
+       while (vx >= 0)
+           vx -= src_width_fixed;
+
+       *pd++ = core_combine_over_u_pixel_vmx (s, d);
+       if (pm)
+           pm++;
+
+       w--;
+    }
+}
+
+FAST_NEAREST_MAINLOOP (vmx_8888_8888_cover_OVER,
+                      scaled_nearest_scanline_vmx_8888_8888_OVER,
+                      uint32_t, uint32_t, COVER)
+FAST_NEAREST_MAINLOOP (vmx_8888_8888_none_OVER,
+                      scaled_nearest_scanline_vmx_8888_8888_OVER,
+                      uint32_t, uint32_t, NONE)
+FAST_NEAREST_MAINLOOP (vmx_8888_8888_pad_OVER,
+                      scaled_nearest_scanline_vmx_8888_8888_OVER,
+                      uint32_t, uint32_t, PAD)
+FAST_NEAREST_MAINLOOP (vmx_8888_8888_normal_OVER,
+                      scaled_nearest_scanline_vmx_8888_8888_OVER,
+                      uint32_t, uint32_t, NORMAL)
+
 static const pixman_fast_path_t vmx_fast_paths[] =
 {
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888),
@@ -2974,6 +3097,11 @@ static const pixman_fast_path_t vmx_fast_paths[] =
     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, vmx_composite_src_x888_8888),
     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, vmx_composite_src_x888_8888),
 
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, vmx_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, vmx_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, vmx_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, vmx_8888_8888),
+
     {   PIXMAN_OP_NONE },
 };