ARM: optimization for scaled src_0565_0565 with nearest filter
author Siarhei Siamashka <siarhei.siamashka@nokia.com>
Sun, 3 Oct 2010 22:56:59 +0000 (01:56 +0300)
committer Siarhei Siamashka <siarhei.siamashka@nokia.com>
Wed, 10 Nov 2010 15:26:49 +0000 (17:26 +0200)
The performance improvement is only in the ballpark of 5% when
compared against C code built with a reasonably good compiler
(gcc 4.5.1). But gcc 4.4 produces approximately 30% slower code
here, so assembly optimization makes sense to avoid dependency
on the compiler quality and/or optimization options.

Benchmark from ARM11:
    == before ==
    op=1, src_fmt=10020565, dst_fmt=10020565, speed=34.86 MPix/s

    == after ==
    op=1, src_fmt=10020565, dst_fmt=10020565, speed=36.62 MPix/s

Benchmark from ARM Cortex-A8:
    == before ==
    op=1, src_fmt=10020565, dst_fmt=10020565, speed=89.55 MPix/s

    == after ==
    op=1, src_fmt=10020565, dst_fmt=10020565, speed=94.91 MPix/s

pixman/pixman-arm-simd-asm.S
pixman/pixman-arm-simd.c

index a3d2d40..7567700 100644 (file)
@@ -1,5 +1,6 @@
 /*
  * Copyright © 2008 Mozilla Corporation
+ * Copyright © 2010 Nokia Corporation
  *
  * Permission to use, copy, modify, distribute, and sell this software and its
  * documentation for any purpose is hereby granted without fee, provided that
@@ -328,3 +329,72 @@ pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
        pop     {r4, r5, r6, r7, r8, r9, r10, r11}
        bx      lr
 .endfunc
+
+/* Nearest-neighbour scaling of one 16bpp scanline; register roles are noted at the .req lines below.
+ * Note: This function is only using armv4t instructions (not even armv6),
+ *       but is scheduled for ARM Cortex-A8 pipeline. So it might need to
+ *       be split into a few variants, tuned for each microarchitecture.
+ *
+ * TODO: In order to get good performance on ARM9/ARM11 cores (which don't
+ * have efficient write combining), it needs to be changed to use 16-byte
+ * aligned writes using STM instruction.
+ */
+pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
+       W       .req    r0 /* number of pixels still to be written */
+       DST     .req    r1 /* destination pointer, advanced by 2 bytes per pixel */
+       SRC     .req    r2 /* base address of the source pixels */
+       VX      .req    r3 /* source coordinate; (VX >> 16) is the source pixel index */
+       UNIT_X  .req    ip /* increment added to VX after every pixel */
+       TMP1    .req    r4 /* byte offset of the next pixel to load, then the pixel itself */
+       TMP2    .req    r5 /* same role as TMP1 for the second pixel of each pair */
+       VXMASK  .req    r6 /* constant mask used when turning VX into a byte offset */
+
+       ldr     UNIT_X, [sp] /* read from the caller's stack top; must precede the push, which moves sp */
+       push    {r4, r5, r6, r7} /* r4-r6 back TMP1/TMP2/VXMASK; r7 is unused here (presumably pushed for stack alignment - confirm) */
+       mvn     VXMASK, #1 /* VXMASK = ~1: clears bit 0 so offsets are multiples of 2 */
+
+       /* helper macro: copies 2 pixels; expects TMP1 = byte offset of the first one and leaves TMP1 = byte offset of the pixel after the pair */
+       .macro  scale_2_pixels
+               ldrh    TMP1, [SRC, TMP1] /* fetch the pixel whose offset was prepared earlier */
+               and     TMP2, VXMASK, VX, lsr #15 /* TMP2 = (VX >> 15) & ~1 = 2 * (VX >> 16) */
+               add     VX, VX, UNIT_X /* step the source coordinate */
+               strh    TMP1, [DST], #2 /* store the pixel, post-incrementing DST */
+
+               ldrh    TMP2, [SRC, TMP2] /* fetch the second pixel of the pair */
+               and     TMP1, VXMASK, VX, lsr #15 /* offset for the next pixel; only computed, not loaded here */
+               add     VX, VX, UNIT_X /* step the source coordinate */
+               strh    TMP2, [DST], #2 /* store the second pixel */
+       .endm
+
+       /* now do the scaling: prime TMP1 with the first pixel's offset, as the macro expects */
+       and     TMP1, VXMASK, VX, lsr #15 /* TMP1 = 2 * (VX >> 16) */
+       add     VX, VX, UNIT_X /* VX now refers to the second pixel */
+       subs    W, #4 /* W is kept biased by -4 from here on */
+       blt     2f /* fewer than 4 pixels: skip the main loop */
+1: /* main loop, process 4 pixels per iteration (two expansions of the 2-pixel macro) */
+       scale_2_pixels
+       scale_2_pixels
+       subs    W, W, #4 /* 4 pixels done; flags tell whether 4 more remain */
+       bge     1b /* loop while at least 4 pixels were still pending */
+2:
+       tst     W, #2 /* W is in -4..-1 if the call had W >= 0; its low 2 bits equal the leftover count (subtracting 4 keeps them) */
+       beq     2f /* bit 1 clear: no leftover pair */
+       scale_2_pixels
+2:
+       tst     W, #1 /* bit 0 set: one last pixel remains */
+       ldrneh  TMP1, [SRC, TMP1] /* conditional (NE) load of the last pixel */
+       strneh  TMP1, [DST], #2 /* conditional (NE) store of the last pixel */
+       /* cleanup helper macro and register aliases so the names can be reused later in the file */
+       .purgem scale_2_pixels
+       .unreq  DST
+       .unreq  SRC
+       .unreq  W
+       .unreq  VX
+       .unreq  UNIT_X
+       .unreq  TMP1
+       .unreq  TMP2
+       .unreq  VXMASK
+       /* return: restore the registers pushed on entry */
+       pop     {r4, r5, r6, r7}
+       bx      lr
+.endfunc
index d466a31..3b05007 100644 (file)
@@ -29,6 +29,7 @@
 
 #include "pixman-private.h"
 #include "pixman-arm-common.h"
+#include "pixman-fast-path.h"
 
 #if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */
 
@@ -386,6 +387,9 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (armv6, over_8888_n_8888,
 PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (armv6, over_n_8_8888,
                                       uint8_t, 1, uint32_t, 1)
 
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
+                                        uint16_t, uint16_t) /* NOTE(review): presumably generates the armv6_0565_0565 glue used in the fast path table; macro is defined elsewhere - confirm */
+
 static const pixman_fast_path_t arm_simd_fast_paths[] =
 {
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
@@ -404,6 +408,9 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),
 
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565),
+
     { PIXMAN_OP_NONE },
 };