From 88014a0e6ffaa22b3ac363c2c73b72530cdba0cc Mon Sep 17 00:00:00 2001 From: Siarhei Siamashka Date: Mon, 1 Nov 2010 10:03:59 +0200 Subject: [PATCH] ARM: nearest scaling support for NEON scanline compositing functions Now it is possible to generate scanline processing functions for the case when the source image is scaled with NEAREST filter. Only 16bpp and 32bpp pixel formats are supported for now. But the others can be also added later when needed. All the existing NEON fast path functions should be quite easy to reuse for implementing fast paths which can work with scaled source images. --- pixman/pixman-arm-neon-asm.h | 176 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 163 insertions(+), 13 deletions(-) diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h index aa5e9bd..d3b506d 100644 --- a/pixman/pixman-arm-neon-asm.h +++ b/pixman/pixman-arm-neon-asm.h @@ -205,6 +205,100 @@ .endif .endm +/* + * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register + * aliases to be defined) + */ +.macro pixld1_s elem_size, reg1, mem_operand +.if elem_size == 16 + mov TMP1, VX, asr #16 + add VX, VX, UNIT_X + add TMP1, mem_operand, TMP1, asl #1 + mov TMP2, VX, asr #16 + add VX, VX, UNIT_X + add TMP2, mem_operand, TMP2, asl #1 + vld1.16 {d®1&[0]}, [TMP1, :16] + mov TMP1, VX, asr #16 + add VX, VX, UNIT_X + add TMP1, mem_operand, TMP1, asl #1 + vld1.16 {d®1&[1]}, [TMP2, :16] + mov TMP2, VX, asr #16 + add VX, VX, UNIT_X + add TMP2, mem_operand, TMP2, asl #1 + vld1.16 {d®1&[2]}, [TMP1, :16] + vld1.16 {d®1&[3]}, [TMP2, :16] +.elseif elem_size == 32 + mov TMP1, VX, asr #16 + add VX, VX, UNIT_X + add TMP1, mem_operand, TMP1, asl #2 + mov TMP2, VX, asr #16 + add VX, VX, UNIT_X + add TMP2, mem_operand, TMP2, asl #2 + vld1.32 {d®1&[0]}, [TMP1, :32] + vld1.32 {d®1&[1]}, [TMP2, :32] +.else + .error "unsupported" +.endif +.endm + +.macro pixld0_s elem_size, reg1, idx, mem_operand +.if elem_size == 16 + mov TMP1, VX, asr #16 + add VX, VX, UNIT_X + add TMP1, mem_operand, TMP1, asl #1 + vld1.16 {d®1&[idx]}, [TMP1, :16] +.elseif elem_size == 32 + mov TMP1, VX, asr #16 + add VX, VX, UNIT_X + add TMP1, mem_operand, TMP1, asl #2 + vld1.32 {d®1&[idx]}, [TMP1, :32] +.endif +.endm + +.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand +.if numbytes == 32 + pixld1_s elem_size, %(basereg+4), mem_operand + pixld1_s elem_size, %(basereg+5), mem_operand + pixld1_s elem_size, %(basereg+6), mem_operand + pixld1_s elem_size, %(basereg+7), mem_operand + pixdeinterleave elem_size, %(basereg+4) +.elseif numbytes == 16 + pixld1_s elem_size, %(basereg+2), mem_operand + pixld1_s elem_size, %(basereg+3), mem_operand +.elseif numbytes == 8 + pixld1_s elem_size, %(basereg+1), mem_operand +.elseif numbytes == 4 + .if elem_size == 32 + pixld0_s elem_size, %(basereg+0), 1, mem_operand + .elseif elem_size == 16 + pixld0_s elem_size, %(basereg+0), 2, mem_operand + pixld0_s elem_size, %(basereg+0), 3, mem_operand + .else + pixld0_s elem_size, %(basereg+0), 4, mem_operand + pixld0_s elem_size, %(basereg+0), 5, mem_operand + pixld0_s elem_size, %(basereg+0), 6, mem_operand + pixld0_s elem_size, %(basereg+0), 7, mem_operand + .endif +.elseif numbytes == 2 + .if elem_size == 16 + pixld0_s elem_size, %(basereg+0), 1, mem_operand + .else + pixld0_s elem_size, %(basereg+0), 2, mem_operand + pixld0_s elem_size, %(basereg+0), 3, mem_operand + .endif +.elseif numbytes == 1 + pixld0_s elem_size, %(basereg+0), 1, mem_operand +.else + .error "unsupported size: numbytes" +.endif +.endm + +.macro pixld_s numpix, bpp, basereg, mem_operand +.if bpp > 0 + pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand +.endif +.endm + .macro vuzp8 reg1, reg2 vuzp.8 d®1, d®2 .endm @@ -792,7 +886,8 @@ fname: * A simplified variant of function generation template for a single * scanline processing (for implementing pixman combine functions) */ -.macro generate_composite_function_single_scanline fname, \ +.macro generate_composite_function_scanline use_nearest_scaling, \ + fname, \ src_bpp_, \ mask_bpp_, \ dst_w_bpp_, \ @@ -830,23 +925,44 @@ fname: .set src_basereg, src_basereg_ .set mask_basereg, mask_basereg_ +.if use_nearest_scaling != 0 + /* + * Assign symbolic names to registers for nearest scaling + */ + W .req r0 + DST_W .req r1 + SRC .req r2 + VX .req r3 + UNIT_X .req ip + MASK .req lr + TMP1 .req r4 + TMP2 .req r5 + DST_R .req r6 + .macro pixld_src x:vararg - pixld x - .endm - .macro fetch_src_pixblock - pixld_src pixblock_size, src_bpp, \ - (src_basereg - pixblock_size * src_bpp / 64), SRC + pixld_s x .endm -/* - * Assign symbolic names to registers - */ + ldr UNIT_X, [sp] + push {r4-r6, lr} + .if mask_bpp != 0 + ldr MASK, [sp, #(16 + 4)] + .endif +.else + /* + * Assign symbolic names to registers + */ W .req r0 /* width (is updated during processing) */ DST_W .req r1 /* destination buffer pointer for writes */ SRC .req r2 /* source buffer pointer */ DST_R .req ip /* destination buffer pointer for reads */ MASK .req r3 /* mask pointer */ + .macro pixld_src x:vararg + pixld x + .endm +.endif + .if (((flags) & FLAG_DST_READWRITE) != 0) .set dst_r_bpp, dst_w_bpp .else @@ -858,6 +974,11 @@ fname: .set DEINTERLEAVE_32BPP_ENABLED, 0 .endif + .macro fetch_src_pixblock + pixld_src pixblock_size, src_bpp, \ + (src_basereg - pixblock_size * src_bpp / 64), SRC + .endm + init mov DST_R, DST_W @@ -896,7 +1017,11 @@ fname: process_pixblock_tail_head cleanup - bx lr /* exit */ +.if use_nearest_scaling != 0 + pop {r4-r6, pc} /* exit */ +.else + bx lr /* exit */ +.endif 8: /* Process the remaining trailing pixels in the scanline (dst unaligned) */ process_trailing_pixels 0, 0, \ @@ -905,19 +1030,44 @@ fname: process_pixblock_tail_head cleanup - bx lr /* exit */ - .purgem fetch_src_pixblock - .purgem pixld_src +.if use_nearest_scaling != 0 + pop {r4-r6, pc} /* exit */ + + .unreq DST_R + .unreq SRC + .unreq W + .unreq VX + .unreq UNIT_X + .unreq TMP1 + .unreq TMP2 + .unreq DST_W + .unreq MASK + +.else + bx lr /* exit */ .unreq SRC .unreq MASK .unreq DST_R .unreq DST_W .unreq W +.endif + + .purgem fetch_src_pixblock + .purgem pixld_src + .endfunc .endm +.macro generate_composite_function_single_scanline x:vararg + generate_composite_function_scanline 0, x +.endm + +.macro generate_composite_function_nearest_scanline x:vararg + generate_composite_function_scanline 1, x +.endm + /* Default prologue/epilogue, nothing special needs to be done */ .macro default_init -- 2.7.4