From 98d08b37f17a3379d0ceff8bb7de8f943873fbd8 Mon Sep 17 00:00:00 2001 From: Siarhei Siamashka Date: Fri, 26 Nov 2010 08:55:49 +0200 Subject: [PATCH] ARM: added 'neon_composite_over_n_8_8' fast path --- pixman/pixman-arm-neon-asm.S | 68 ++++++++++++++++++++++++++++++++++++++++++++ pixman/pixman-arm-neon.c | 3 ++ 2 files changed, 71 insertions(+) diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S index 91ec27d..a3875ee 100644 --- a/pixman/pixman-arm-neon-asm.S +++ b/pixman/pixman-arm-neon-asm.S @@ -1203,6 +1203,74 @@ generate_composite_function \ /******************************************************************************/ +.macro pixman_composite_over_n_8_8_process_pixblock_head /* Head of a 32-pixel block. Assumes d24-d27 = mask bytes and d4-d7 = destination bytes on entry (as loaded by the tail_head macro), and d8 = the byte broadcast by the init macro. */ + vmull.u8 q0, d24, d8 + vmull.u8 q1, d25, d8 + vmull.u8 q6, d26, d8 + vmull.u8 q7, d27, d8 /* q0, q1, q6, q7 = mask * d8 as 16-bit products */ + vrshr.u16 q10, q0, #8 + vrshr.u16 q11, q1, #8 + vrshr.u16 q12, q6, #8 + vrshr.u16 q13, q7, #8 + vraddhn.u16 d0, q0, q10 + vraddhn.u16 d1, q1, q11 + vraddhn.u16 d2, q6, q12 + vraddhn.u16 d3, q7, q13 /* vrshr + vraddhn narrow each product back to 8 bits with rounding (the usual approximate divide by 255); d0-d3 now hold the scaled mask */ + vmvn.8 q12, q0 + vmvn.8 q13, q1 /* q12, q13 (aliasing d24-d27, so the raw mask is overwritten) = bitwise NOT of d0-d3, i.e. 255 minus the scaled mask */ + vmull.u8 q8, d24, d4 + vmull.u8 q9, d25, d5 + vmull.u8 q10, d26, d6 + vmull.u8 q11, d27, d7 /* q8-q11 = destination * inverted scaled mask, still 16-bit; narrowing is left to the tail macro */ +.endm + +.macro pixman_composite_over_n_8_8_process_pixblock_tail /* Tail of a block: narrows q8-q11 and adds the scaled mask, leaving 32 result bytes in q14, q15. */ + vrshr.u16 q14, q8, #8 + vrshr.u16 q15, q9, #8 + vrshr.u16 q12, q10, #8 + vrshr.u16 q13, q11, #8 + vraddhn.u16 d28, q14, q8 + vraddhn.u16 d29, q15, q9 + vraddhn.u16 d30, q12, q10 + vraddhn.u16 d31, q13, q11 /* d28-d31 = q8-q11 narrowed to 8 bits with the same rounding idiom as in the head; q12, q13 are used as scratch */ + vqadd.u8 q14, q0, q14 + vqadd.u8 q15, q1, q15 /* saturating add of the scaled mask in q0, q1 gives the final bytes in d28-d31 */ +.endm + +/* TODO: expand macros and do better instruction scheduling */ +.macro pixman_composite_over_n_8_8_process_pixblock_tail_head /* Steady-state step: finishes the previous block, stores it, and starts the next one. */ + vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! /* next 32 destination bytes; the tail does not read d4-d7 */ + pixman_composite_over_n_8_8_process_pixblock_tail + vld1.8 {d24, d25, d26, d27}, [MASK]! /* next 32 mask bytes, loaded only after the tail because the tail uses q12, q13 (d24-d27) as scratch */ + cache_preload 32, 32 + vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! /* store the 32 bytes finished by the tail */ 
+ pixman_composite_over_n_8_8_process_pixblock_head +.endm + +.macro pixman_composite_over_n_8_8_init /* Loads the 32-bit value found at sp + ARGS_STACK_OFFSET and broadcasts its byte lane 3 into every lane of d8 (presumably the alpha of the solid source colour; confirm against the caller). */ + add DUMMY, sp, #ARGS_STACK_OFFSET /* address is computed before vpush, which moves sp */ + vpush {d8-d15} /* saved because the pixblock macros use d8 and q6, q7 (d12-d15) */ + vld1.32 {d8[0]}, [DUMMY] + vdup.8 d8, d8[3] +.endm + +.macro pixman_composite_over_n_8_8_cleanup /* Restores the registers saved by the init macro. */ + vpop {d8-d15} +.endm + /* Instantiates pixman_composite_over_n_8_8_asm_neon from the macros above. NOTE(review): the arguments 0, 8, 8 look like source, mask and destination bits per pixel; confirm against the generate_composite_function definition. */ +generate_composite_function \ + pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \ + FLAG_DST_READWRITE, \ + 32, /* number of pixels, processed in a single block */ \ + 5, /* prefetch distance */ \ + pixman_composite_over_n_8_8_init, \ + pixman_composite_over_n_8_8_cleanup, \ + pixman_composite_over_n_8_8_process_pixblock_head, \ + pixman_composite_over_n_8_8_process_pixblock_tail, \ + pixman_composite_over_n_8_8_process_pixblock_tail_head + +/******************************************************************************/ + .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head /* * 'combine_mask_ca' replacement diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c index 2f82069..72ef75e 100644 --- a/pixman/pixman-arm-neon.c +++ b/pixman/pixman-arm-neon.c @@ -76,6 +76,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, over_n_8_8888, uint8_t, 1, uint32_t, 1) PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, over_n_8888_8888_ca, uint32_t, 1, uint32_t, 1) +PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, over_n_8_8, + uint8_t, 1, uint8_t, 1) PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, add_n_8_8, uint8_t, 1, uint8_t, 1) @@ -235,6 +237,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] = PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, x8r8g8b8, neon_composite_src_0888_8888_rev), PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, r5g6b5, neon_composite_src_0888_0565_rev), PIXMAN_STD_FAST_PATH (SRC, pixbuf, pixbuf, a8r8g8b8, neon_composite_src_pixbuf_8888), + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8, neon_composite_over_n_8_8), PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, neon_composite_over_n_8_0565), PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, neon_composite_over_n_8_0565), 
PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, neon_composite_over_n_8_8888), -- 2.7.4