From 6682b2b3597c9f431900bfe7b1b42dfbe006bae5 Mon Sep 17 00:00:00 2001
From: Taekyun Kim <tkq.kim@samsung.com>
Date: Tue, 20 Sep 2011 21:32:35 +0900
Subject: [PATCH] ARM: NEON: Bilinear macro template for instruction scheduling

This macro template takes 6 code blocks.

1. process_last_pixel
2. process_two_pixels
3. process_four_pixels
4. process_pixblock_head
5. process_pixblock_tail
6. process_pixblock_tail_head

process_last_pixel does not need to update horizontal weight. This
is done by the template. two and four code block should update
horizontal weight inside of them. head/tail/tail_head blocks
consist unrolled core loop. You can apply instruction scheduling
to the tail_head blocks.

You can also specify size of the pixel block. Supported size is 4
and 8. If you want to use mask, give BILINEAR_FLAG_USE_MASK flags
to the template, then you can use register MASK. When using d8~d15
registers, give BILINEAR_FLAG_USE_ALL_NEON_REGS to make sure
registers are properly saved on the stack and later restored.
---
 pixman/pixman-arm-neon-asm-bilinear.S | 195 ++++++++++++++++++++++++++++++++++
 1 file changed, 195 insertions(+)

diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index c5ba929..784e5df 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -773,3 +773,198 @@ generate_bilinear_scanline_func_src_a8_dst \
 generate_bilinear_scanline_func_src_a8_dst \
     pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
     8888, 8888, add, 2, 28
+
+.set BILINEAR_FLAG_USE_MASK,		1
+.set BILINEAR_FLAG_USE_ALL_NEON_REGS,	2
+
+/*
+ * Main template macro for generating NEON optimized bilinear scanline functions.
+ *
+ * Bilinear scanline generator macro take folling arguments:
+ *  fname			- name of the function to generate
+ *  src_fmt			- source color format (8888 or 0565)
+ *  dst_fmt			- destination color format (8888 or 0565)
+ *  src/dst_bpp_shift		- (1 << bpp_shift) is the size of src/dst pixel in bytes
+ *  process_last_pixel		- code block that interpolate one pixel and does not
+ *				  update horizontal weight
+ *  process_two_pixels		- code block that interpolate two pixels and update
+ *				  horizontal weight
+ *  process_four_pixels		- code block that interpolate four pixels and update
+ *				  horizontal weight
+ *  process_pixblock_head	- head part of middle loop
+ *  process_pixblock_tail	- tail part of middle loop
+ *  process_pixblock_tail_head	- tail_head of middle loop
+ *  pixblock_size		- number of pixels processed in a single middle loop
+ *  prefetch_distance		- prefetch in the source image by that many pixels ahead
+ */
+
+.macro generate_bilinear_scanline_func \
+	fname, \
+	src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \
+	bilinear_process_last_pixel, \
+	bilinear_process_two_pixels, \
+	bilinear_process_four_pixels, \
+	bilinear_process_pixblock_head, \
+	bilinear_process_pixblock_tail, \
+	bilinear_process_pixblock_tail_head, \
+	pixblock_size, \
+	prefetch_distance, \
+	flags
+
+pixman_asm_function fname
+.if pixblock_size == 8
+.elseif pixblock_size == 4
+.else
+    .error unsupported pixblock size
+.endif
+
+.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
+    OUT       .req    r0
+    TOP       .req    r1
+    BOTTOM    .req    r2
+    WT        .req    r3
+    WB        .req    r4
+    X         .req    r5
+    UX        .req    r6
+    WIDTH     .req    ip
+    TMP1      .req    r3
+    TMP2      .req    r4
+    PF_OFFS   .req    r7
+    TMP3      .req    r8
+    TMP4      .req    r9
+    STRIDE    .req    r2
+
+    mov		ip, sp
+    push	{r4, r5, r6, r7, r8, r9}
+    mov		PF_OFFS, #prefetch_distance
+    ldmia	ip, {WB, X, UX, WIDTH}
+.else
+    OUT       .req      r0
+    MASK      .req      r1
+    TOP       .req      r2
+    BOTTOM    .req      r3
+    WT        .req      r4
+    WB        .req      r5
+    X         .req      r6
+    UX        .req      r7
+    WIDTH     .req      ip
+    TMP1      .req      r4
+    TMP2      .req      r5
+    PF_OFFS   .req      r8
+    TMP3      .req      r9
+    TMP4      .req      r10
+    STRIDE    .req      r3
+
+    mov       ip, sp
+    push      {r4, r5, r6, r7, r8, r9, r10, ip}
+    mov       PF_OFFS, #prefetch_distance
+    ldmia     ip, {WT, WB, X, UX, WIDTH}
+.endif
+
+    mul       PF_OFFS, PF_OFFS, UX
+
+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+    vpush     {d8-d15}
+.endif
+
+    sub	      STRIDE, BOTTOM, TOP
+    .unreq    BOTTOM
+
+    cmp       WIDTH, #0
+    ble       3f
+
+    vdup.u16  q12, X
+    vdup.u16  q13, UX
+    vdup.u8   d28, WT
+    vdup.u8   d29, WB
+    vadd.u16  d25, d25, d26
+
+    /* ensure good destination alignment  */
+    cmp       WIDTH, #1
+    blt       0f
+    tst       OUT, #(1 << dst_bpp_shift)
+    beq       0f
+    vshr.u16  q15, q12, #8
+    vadd.u16  q12, q12, q13
+    bilinear_process_last_pixel
+    sub       WIDTH, WIDTH, #1
+0:
+    vadd.u16  q13, q13, q13
+    vshr.u16  q15, q12, #8
+    vadd.u16  q12, q12, q13
+
+    cmp       WIDTH, #2
+    blt       0f
+    tst       OUT, #(1 << (dst_bpp_shift + 1))
+    beq       0f
+    bilinear_process_two_pixels
+    sub       WIDTH, WIDTH, #2
+0:
+.if pixblock_size == 8
+    cmp       WIDTH, #4
+    blt       0f
+    tst       OUT, #(1 << (dst_bpp_shift + 2))
+    beq       0f
+    bilinear_process_four_pixels
+    sub       WIDTH, WIDTH, #4
+0:
+.endif
+    subs      WIDTH, WIDTH, #pixblock_size
+    blt       1f
+    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
+    bilinear_process_pixblock_head
+    subs      WIDTH, WIDTH, #pixblock_size
+    blt       5f
+0:
+    bilinear_process_pixblock_tail_head
+    subs      WIDTH, WIDTH, #pixblock_size
+    bge       0b
+5:
+    bilinear_process_pixblock_tail
+1:
+.if pixblock_size == 8
+    tst       WIDTH, #4
+    beq       2f
+    bilinear_process_four_pixels
+2:
+.endif
+    /* handle the remaining trailing pixels */
+    tst       WIDTH, #2
+    beq       2f
+    bilinear_process_two_pixels
+2:
+    tst       WIDTH, #1
+    beq       3f
+    bilinear_process_last_pixel
+3:
+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+    vpop      {d8-d15}
+.endif
+
+.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
+    pop       {r4, r5, r6, r7, r8, r9}
+.else
+    pop       {r4, r5, r6, r7, r8, r9, r10, ip}
+.endif
+    bx        lr
+
+    .unreq    OUT
+    .unreq    TOP
+    .unreq    WT
+    .unreq    WB
+    .unreq    X
+    .unreq    UX
+    .unreq    WIDTH
+    .unreq    TMP1
+    .unreq    TMP2
+    .unreq    PF_OFFS
+    .unreq    TMP3
+    .unreq    TMP4
+    .unreq    STRIDE
+.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
+    .unreq    MASK
+.endif
+
+.endfunc
+
+.endm
-- 
2.7.4