/******************************************************************************/
+.macro pixman_composite_over_n_8_8_process_pixblock_head
+ vmull.u8 q0, d24, d8
+ vmull.u8 q1, d25, d8
+ vmull.u8 q6, d26, d8
+ vmull.u8 q7, d27, d8
+ vrshr.u16 q10, q0, #8
+ vrshr.u16 q11, q1, #8
+ vrshr.u16 q12, q6, #8
+ vrshr.u16 q13, q7, #8
+ vraddhn.u16 d0, q0, q10
+ vraddhn.u16 d1, q1, q11
+ vraddhn.u16 d2, q6, q12
+ vraddhn.u16 d3, q7, q13
+ vmvn.8 q12, q0
+ vmvn.8 q13, q1
+ vmull.u8 q8, d24, d4
+ vmull.u8 q9, d25, d5
+ vmull.u8 q10, d26, d6
+ vmull.u8 q11, d27, d7
+.endm
+
+.macro pixman_composite_over_n_8_8_process_pixblock_tail
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ vraddhn.u16 d30, q12, q10
+ vraddhn.u16 d31, q13, q11
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_n_8_8_process_pixblock_tail_head
+ vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ pixman_composite_over_n_8_8_process_pixblock_tail
+ vld1.8 {d24, d25, d26, d27}, [MASK]!
+ cache_preload 32, 32
+ vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ pixman_composite_over_n_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_n_8_8_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vpush {d8-d15}
+ vld1.32 {d8[0]}, [DUMMY]
+ vdup.8 d8, d8[3]
+.endm
+
+.macro pixman_composite_over_n_8_8_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
+ FLAG_DST_READWRITE, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8_8_init, \
+ pixman_composite_over_n_8_8_cleanup, \
+ pixman_composite_over_n_8_8_process_pixblock_head, \
+ pixman_composite_over_n_8_8_process_pixblock_tail, \
+ pixman_composite_over_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
/*
* 'combine_mask_ca' replacement
uint8_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, over_n_8888_8888_ca,
uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, over_n_8_8,
+ uint8_t, 1, uint8_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, add_n_8_8,
uint8_t, 1, uint8_t, 1)
PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, x8r8g8b8, neon_composite_src_0888_8888_rev),
PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, r5g6b5, neon_composite_src_0888_0565_rev),
PIXMAN_STD_FAST_PATH (SRC, pixbuf, pixbuf, a8r8g8b8, neon_composite_src_pixbuf_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8, neon_composite_over_n_8_8),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, neon_composite_over_n_8_0565),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, neon_composite_over_n_8_0565),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, neon_composite_over_n_8_8888),