From: Siarhei Siamashka Date: Tue, 2 Nov 2010 17:16:46 +0000 (+0200) Subject: ARM: performance tuning of NEON nearest scaled pixel fetcher X-Git-Tag: pixman-0.21.2~10 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=0b56244ac81f2bb2402629f8720c7e22893a24df;p=platform%2Fupstream%2Fpixman.git ARM: performance tuning of NEON nearest scaled pixel fetcher Interleaving the use of NEON registers helps to avoid some stalls in NEON pipeline and provides a small performance improvement. --- diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h index d3b506d..c75bdc3 100644 --- a/pixman/pixman-arm-neon-asm.h +++ b/pixman/pixman-arm-neon-asm.h @@ -241,6 +241,30 @@ .endif .endm +.macro pixld2_s elem_size, reg1, reg2, mem_operand +.if elem_size == 32 + mov TMP1, VX, asr #16 + add VX, VX, UNIT_X, asl #1 + add TMP1, mem_operand, TMP1, asl #2 + mov TMP2, VX, asr #16 + sub VX, VX, UNIT_X + add TMP2, mem_operand, TMP2, asl #2 + vld1.32 {d&reg1&[0]}, [TMP1, :32] + mov TMP1, VX, asr #16 + add VX, VX, UNIT_X, asl #1 + add TMP1, mem_operand, TMP1, asl #2 + vld1.32 {d&reg2&[0]}, [TMP2, :32] + mov TMP2, VX, asr #16 + add VX, VX, UNIT_X + add TMP2, mem_operand, TMP2, asl #2 + vld1.32 {d&reg1&[1]}, [TMP1, :32] + vld1.32 {d&reg2&[1]}, [TMP2, :32] +.else + pixld1_s elem_size, reg1, mem_operand + pixld1_s elem_size, reg2, mem_operand +.endif +.endm + .macro pixld0_s elem_size, reg1, idx, mem_operand .if elem_size == 16 mov TMP1, VX, asr #16 @@ -257,14 +281,11 @@ .macro pixld_s_internal numbytes, elem_size, basereg, mem_operand .if numbytes == 32 - pixld1_s elem_size, %(basereg+4), mem_operand - pixld1_s elem_size, %(basereg+5), mem_operand - pixld1_s elem_size, %(basereg+6), mem_operand - pixld1_s elem_size, %(basereg+7), mem_operand + pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand + pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand pixdeinterleave elem_size, %(basereg+4) .elseif numbytes == 16 - pixld1_s elem_size, %(basereg+2), mem_operand - pixld1_s 
elem_size, %(basereg+3), mem_operand + pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand .elseif numbytes == 8 pixld1_s elem_size, %(basereg+1), mem_operand .elseif numbytes == 4