From 3990931bf6197eff1cec06cf24bce53ddf9a539a Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Sat, 27 Nov 2010 04:47:39 +0200
Subject: [PATCH] ARM: reuse common NEON code for over_{n_8|8888_n|8888_8}_0565

Renamed suppementary macros from 'over_n_8_0565' to 'over_8888_8_0565',
because they can actually support all variants of this operation:
over_8888_8_0565/over_n_8_0565/over_8888_n_0565.

Also 'over_8888_8_0565' now uses more optimized common code instead of its
own variant, improving performance a bit. Even though this operation is
still memory bandwidth limited, scaled variants of these fast paths may
put more stress on CPU later.

Benchmarked on ARM Cortex-A8 @500MHz:

== before ==

    over_8888_8_0565 =  L1:  67.10  L2:  53.82  M: 44.70 (105.17%)
                        HT:  18.73  VT:  16.91  R: 14.25  RT:  4.80 (52Kops/s)

== after ==

    over_8888_8_0565 =  L1:  77.83  L2:  58.14  M: 44.82 (105.52%)
                        HT:  20.58  VT:  17.44  R: 15.05  RT:  4.88 (52Kops/s)
---
 pixman/pixman-arm-neon-asm.S | 61 ++++++++++++++++++--------------------------
 1 file changed, 25 insertions(+), 36 deletions(-)

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 3e52a49..4175144 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -791,7 +791,7 @@ generate_composite_function \
 
 /******************************************************************************/
 
-.macro pixman_composite_over_n_8_0565_process_pixblock_head
+.macro pixman_composite_over_8888_8_0565_process_pixblock_head
     vmull.u8    q0,  d24, d8    /* IN for SRC pixels (part1) */
     vmull.u8    q1,  d24, d9
     vmull.u8    q6,  d24, d10
@@ -816,7 +816,7 @@ generate_composite_function \
     vmull.u8    q10, d3, d30
 .endm
 
-.macro pixman_composite_over_n_8_0565_process_pixblock_tail
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
     /* 3 cycle bubble (after vmull.u8) */
     vrshr.u16   q13, q8,  #8
     vrshr.u16   q11, q9,  #8
@@ -835,7 +835,7 @@ generate_composite_function \
     vsri.u16    q14, q9,  #11
 .endm
 
-.macro pixman_composite_over_n_8_0565_process_pixblock_tail_head
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
     vld1.16     {d4, d5}, [DST_R, :128]!
     vshrn.u16   d6,  q2,  #8
     fetch_mask_pixblock
@@ -880,6 +880,23 @@ generate_composite_function \
     vmull.u8    q10, d3,  d30
 .endm
 
+generate_composite_function \
+    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
 /*
  * This function needs a special initialization of solid mask.
  * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
@@ -911,9 +928,9 @@ generate_composite_function \
     5, /* prefetch distance */ \
     pixman_composite_over_n_8_0565_init, \
     pixman_composite_over_n_8_0565_cleanup, \
-    pixman_composite_over_n_8_0565_process_pixblock_head, \
-    pixman_composite_over_n_8_0565_process_pixblock_tail, \
-    pixman_composite_over_n_8_0565_process_pixblock_tail_head
+    pixman_composite_over_8888_8_0565_process_pixblock_head, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head
 
 /******************************************************************************/
 
@@ -935,36 +952,8 @@ generate_composite_function \
     5, /* prefetch distance */ \
     pixman_composite_over_8888_n_0565_init, \
     pixman_composite_over_8888_n_0565_cleanup, \
-    pixman_composite_over_n_8_0565_process_pixblock_head, \
-    pixman_composite_over_n_8_0565_process_pixblock_tail, \
-    pixman_composite_over_n_8_0565_process_pixblock_tail_head, \
-    28, /* dst_w_basereg */ \
-    4,  /* dst_r_basereg */ \
-    8,  /* src_basereg   */ \
-    24  /* mask_basereg  */
-
-/******************************************************************************/
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
-    vld1.16     {d4, d5}, [DST_R, :128]!
-    pixman_composite_over_n_8_0565_process_pixblock_tail
-    fetch_src_pixblock
-    cache_preload 8, 8
-    fetch_mask_pixblock
-    pixman_composite_over_n_8_0565_process_pixblock_head
-    vst1.16     {d28, d29}, [DST_W, :128]!
-.endm
-
-generate_composite_function \
-    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
-    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
-    8, /* number of pixels, processed in a single block */ \
-    5, /* prefetch distance */ \
-    default_init_need_all_regs, \
-    default_cleanup_need_all_regs, \
-    pixman_composite_over_n_8_0565_process_pixblock_head, \
-    pixman_composite_over_n_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
     pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
     28, /* dst_w_basereg */ \
     4,  /* dst_r_basereg */ \
-- 
2.7.4