From 88014a0e6ffaa22b3ac363c2c73b72530cdba0cc Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Mon, 1 Nov 2010 10:03:59 +0200
Subject: [PATCH] ARM: nearest scaling support for NEON scanline compositing
 functions

Now it is possible to generate scanline processing functions
for the case when the source image is scaled with NEAREST filter.

Only 16bpp and 32bpp pixel formats are supported for now. But the
others can be also added later when needed. All the existing NEON
fast path functions should be quite easy to reuse for implementing
fast paths which can work with scaled source images.
---
 pixman/pixman-arm-neon-asm.h | 176 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 163 insertions(+), 13 deletions(-)

diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index aa5e9bd..d3b506d 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -205,6 +205,100 @@
 .endif
 .endm
 
+/*
+ * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
+ * aliases to be defined)
+ */
+.macro pixld1_s elem_size, reg1, mem_operand
+.if elem_size == 16
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP1, mem_operand, TMP1, asl #1
+    mov     TMP2, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP2, mem_operand, TMP2, asl #1
+    vld1.16 {d&reg1&[0]}, [TMP1, :16]
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP1, mem_operand, TMP1, asl #1
+    vld1.16 {d&reg1&[1]}, [TMP2, :16]
+    mov     TMP2, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP2, mem_operand, TMP2, asl #1
+    vld1.16 {d&reg1&[2]}, [TMP1, :16]
+    vld1.16 {d&reg1&[3]}, [TMP2, :16]
+.elseif elem_size == 32
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP1, mem_operand, TMP1, asl #2
+    mov     TMP2, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP2, mem_operand, TMP2, asl #2
+    vld1.32 {d&reg1&[0]}, [TMP1, :32]
+    vld1.32 {d&reg1&[1]}, [TMP2, :32]
+.else
+    .error "unsupported"
+.endif
+.endm
+
+.macro pixld0_s elem_size, reg1, idx, mem_operand
+.if elem_size == 16
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP1, mem_operand, TMP1, asl #1
+    vld1.16 {d&reg1&[idx]}, [TMP1, :16]
+.elseif elem_size == 32
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP1, mem_operand, TMP1, asl #2
+    vld1.32 {d&reg1&[idx]}, [TMP1, :32]
+.endif
+.endm
+
+.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
+.if numbytes == 32
+    pixld1_s elem_size, %(basereg+4), mem_operand
+    pixld1_s elem_size, %(basereg+5), mem_operand
+    pixld1_s elem_size, %(basereg+6), mem_operand
+    pixld1_s elem_size, %(basereg+7), mem_operand
+    pixdeinterleave elem_size, %(basereg+4)
+.elseif numbytes == 16
+    pixld1_s elem_size, %(basereg+2), mem_operand
+    pixld1_s elem_size, %(basereg+3), mem_operand
+.elseif numbytes == 8
+    pixld1_s elem_size, %(basereg+1), mem_operand
+.elseif numbytes == 4
+    .if elem_size == 32
+        pixld0_s elem_size, %(basereg+0), 1, mem_operand
+    .elseif elem_size == 16
+        pixld0_s elem_size, %(basereg+0), 2, mem_operand
+        pixld0_s elem_size, %(basereg+0), 3, mem_operand
+    .else
+        pixld0_s elem_size, %(basereg+0), 4, mem_operand
+        pixld0_s elem_size, %(basereg+0), 5, mem_operand
+        pixld0_s elem_size, %(basereg+0), 6, mem_operand
+        pixld0_s elem_size, %(basereg+0), 7, mem_operand
+    .endif
+.elseif numbytes == 2
+    .if elem_size == 16
+        pixld0_s elem_size, %(basereg+0), 1, mem_operand
+    .else
+        pixld0_s elem_size, %(basereg+0), 2, mem_operand
+        pixld0_s elem_size, %(basereg+0), 3, mem_operand
+    .endif
+.elseif numbytes == 1
+    pixld0_s elem_size, %(basereg+0), 1, mem_operand
+.else
+    .error "unsupported size: numbytes"
+.endif
+.endm
+
+.macro pixld_s numpix, bpp, basereg, mem_operand
+.if bpp > 0
+    pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
+.endif
+.endm
+
 .macro vuzp8 reg1, reg2
     vuzp.8 d&reg1, d&reg2
 .endm
@@ -792,7 +886,8 @@ fname:
  * A simplified variant of function generation template for a single
  * scanline processing (for implementing pixman combine functions)
  */
-.macro generate_composite_function_single_scanline fname, \
+.macro generate_composite_function_scanline        use_nearest_scaling, \
+                                                   fname, \
                                                    src_bpp_, \
                                                    mask_bpp_, \
                                                    dst_w_bpp_, \
@@ -830,23 +925,44 @@ fname:
     .set src_basereg, src_basereg_
     .set mask_basereg, mask_basereg_
 
+.if use_nearest_scaling != 0
+    /*
+     * Assign symbolic names to registers for nearest scaling
+     */
+    W           .req        r0
+    DST_W       .req        r1
+    SRC         .req        r2
+    VX          .req        r3
+    UNIT_X      .req        ip
+    MASK        .req        lr
+    TMP1        .req        r4
+    TMP2        .req        r5
+    DST_R       .req        r6
+
     .macro pixld_src x:vararg
-        pixld x
-    .endm
-    .macro fetch_src_pixblock
-        pixld_src   pixblock_size, src_bpp, \
-                    (src_basereg - pixblock_size * src_bpp / 64), SRC
+        pixld_s x
     .endm
 
-/*
- * Assign symbolic names to registers
- */
+    ldr         UNIT_X, [sp]
+    push        {r4-r6, lr}
+    .if mask_bpp != 0
+    ldr         MASK, [sp, #(16 + 4)]
+    .endif
+.else
+    /*
+     * Assign symbolic names to registers
+     */
     W           .req        r0      /* width (is updated during processing) */
     DST_W       .req        r1      /* destination buffer pointer for writes */
     SRC         .req        r2      /* source buffer pointer */
     DST_R       .req        ip      /* destination buffer pointer for reads */
     MASK        .req        r3      /* mask pointer */
 
+    .macro pixld_src x:vararg
+        pixld x
+    .endm
+.endif
+
 .if (((flags) & FLAG_DST_READWRITE) != 0)
     .set dst_r_bpp, dst_w_bpp
 .else
@@ -858,6 +974,11 @@ fname:
     .set DEINTERLEAVE_32BPP_ENABLED, 0
 .endif
 
+    .macro fetch_src_pixblock
+        pixld_src   pixblock_size, src_bpp, \
+                    (src_basereg - pixblock_size * src_bpp / 64), SRC
+    .endm
+
     init
     mov         DST_R, DST_W
 
@@ -896,7 +1017,11 @@ fname:
                             process_pixblock_tail_head
 
     cleanup
-    bx         lr  /* exit */
+.if use_nearest_scaling != 0
+    pop         {r4-r6, pc}  /* exit */
+.else
+    bx          lr  /* exit */
+.endif
 8:
     /* Process the remaining trailing pixels in the scanline (dst unaligned) */
     process_trailing_pixels 0, 0, \
@@ -905,19 +1030,44 @@ fname:
                             process_pixblock_tail_head
 
     cleanup
-    bx          lr  /* exit */
 
-    .purgem     fetch_src_pixblock
-    .purgem     pixld_src
+.if use_nearest_scaling != 0
+    pop         {r4-r6, pc}  /* exit */
+
+    .unreq      DST_R
+    .unreq      SRC
+    .unreq      W
+    .unreq      VX
+    .unreq      UNIT_X
+    .unreq      TMP1
+    .unreq      TMP2
+    .unreq      DST_W
+    .unreq      MASK
+
+.else
+    bx          lr  /* exit */
 
     .unreq      SRC
     .unreq      MASK
     .unreq      DST_R
     .unreq      DST_W
     .unreq      W
+.endif
+
+    .purgem     fetch_src_pixblock
+    .purgem     pixld_src
+
     .endfunc
 .endm
 
+.macro generate_composite_function_single_scanline x:vararg
+    generate_composite_function_scanline 0, x
+.endm
+
+.macro generate_composite_function_nearest_scanline x:vararg
+    generate_composite_function_scanline 1, x
+.endm
+
 /* Default prologue/epilogue, nothing special needs to be done */
 
 .macro default_init
-- 
2.7.4