2 * Copyright © 2009 Nokia Corporation
4 * Permission to use, copy, modify, distribute, and sell this software and its
5 * documentation for any purpose is hereby granted without fee, provided that
6 * the above copyright notice appear in all copies and that both that
7 * copyright notice and this permission notice appear in supporting
8 * documentation, and that the name of Nokia Corporation not be used in
9 * advertising or publicity pertaining to distribution of the software without
10 * specific, written prior permission. Nokia Corporation makes no
11 * representations about the suitability of this software for any purpose.
12 * It is provided "as is" without express or implied warranty.
14 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
15 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
16 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
17 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
18 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
19 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
20 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
23 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
27 * This file contains implementations of NEON optimized pixel processing
28 * functions. There is no full and detailed tutorial, but some functions
29 * (those which are exposing some new or interesting features) are
30 * extensively commented and can be used as examples.
32 * You may want to have a look at the comments for following functions:
33 * - pixman_composite_over_8888_0565_asm_neon
34 * - pixman_composite_over_n_8_0565_asm_neon
37 /* Prevent the stack from becoming executable for no reason... */
38 #if defined(__linux__) && defined(__ELF__)
39 .section .note.GNU-stack,"",%progbits
47 #include "pixman-arm-neon-asm.h"
49 /* Global configuration options and preferences */
52 * The code can optionally make use of unaligned memory accesses to improve
53 * performance of handling leading/trailing pixels for each scanline.
54 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
55 * example in linux if unaligned memory accesses are not configured to
56 * generate exceptions.
58 .set RESPECT_STRICT_ALIGNMENT, 1
61 * Set default prefetch type. There is a choice between the following options:
63 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
64 * as NOP to workaround some HW bugs or for whatever other reason)
66 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
67 * advanced prefetch introduces heavy overhead)
69 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
70 * which can run ARM and NEON instructions simultaneously so that extra ARM
71 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
73 * Note: some types of function can't support advanced prefetch and fallback
74 * to simple one (those which handle 24bpp pixels)
76 .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
78 /* Prefetch distance in pixels for simple prefetch */
79 .set PREFETCH_DISTANCE_SIMPLE, 64
82 * Implementation of pixman_composite_over_8888_0565_asm_neon
84 * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
85 * performs OVER compositing operation. Function fast_composite_over_8888_0565
86 * from pixman-fast-path.c does the same in C and can be used as a reference.
88 * First we need to have some NEON assembly code which can do the actual
89 * operation on the pixels and provide it to the template macro.
91 * Template macro quite conveniently takes care of emitting all the necessary
92 * code for memory reading and writing (including quite tricky cases of
93 * handling unaligned leading/trailing pixels), so we only need to deal with
94 * the data in NEON registers.
96 * NEON registers allocation in general is recommended to be the following:
97 * d0, d1, d2, d3 - contain loaded source pixel data
98 * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed)
99 * d24, d25, d26, d27 - contain loaded mask pixel data (if mask is used)
100 * d28, d29, d30, d31 - place for storing the result (destination pixels)
102 * As can be seen above, four 64-bit NEON registers are used for keeping
103 * intermediate pixel data and up to 8 pixels can be processed in one step
104 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
106 * This particular function uses the following registers allocation:
107 * d0, d1, d2, d3 - contain loaded source pixel data
108 * d4, d5 - contain loaded destination pixels (they are needed)
109 * d28, d29 - place for storing the result (destination pixels)
113 * Step one. We need to have some code to do some arithmetics on pixel data.
114 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
115 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
116 * perform all the needed calculations and write the result to {d28, d29}.
117 * The rationale for having two macros and not just one will be explained
118 * later. In practice, any single monolithic function which does the work can
119 * be split into two parts in any arbitrary way without affecting correctness.
121 * There is one special trick here too. Common template macro can optionally
122 * make our life a bit easier by doing R, G, B, A color components
123 * deinterleaving for 32bpp pixel formats (and this feature is used in
124 * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
125 * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
126 * actually use d0 register for blue channel (a vector of eight 8-bit
127 * values), d1 register for green, d2 for red and d3 for alpha. This
128 * simple conversion can be also done with a few NEON instructions:
130 * Packed to planar conversion:
136 * Planar to packed conversion:
142 * But pixel can be loaded directly in planar format using VLD4.8 NEON
143 * instruction. It is 1 cycle slower than VLD1.32, so this is not always
144 * desirable, that's why deinterleaving is optional.
146 * But anyway, here is the code:
/*
 * Head half of the OVER(a8r8g8b8, r5g6b5) per-block arithmetic.
 * Expects deinterleaved source in {d0..d3} (b, g, r, a) and packed
 * r5g6b5 destination pixels in {d4, d5} (q2).
 * NOTE(review): several instructions of this macro (and its .endm) are
 * elided in this excerpt; comments below cover the visible lines only.
 */
148 .macro pixman_composite_over_8888_0565_process_pixblock_head
149 /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
150 and put data into d6 - red, d7 - green, d30 - blue */
155 vmvn.8 d3, d3 /* invert source alpha */
/* Narrowing shift of the packed 16-bit pixels — part of the channel
   extraction described above (companion extractions are elided). */
157 vshrn.u16 d30, q2, #2
158 /* now do alpha blending, storing results in 8-bit planar format
159 into d16 - red, d19 - green, d18 - blue */
/* q12 = inv_alpha * blue (widening 8x8 -> 16-bit multiply) */
162 vmull.u8 q12, d3, d30
/* Each vrshr + vraddhn pair below approximates x/255 with rounding:
 * result = (x + ((x + 128) >> 8) + 128) >> 8, narrowed back to 8 bits. */
163 vrshr.u16 q13, q10, #8
164 vrshr.u16 q3, q11, #8
165 vrshr.u16 q15, q12, #8
166 vraddhn.u16 d20, q10, q13
167 vraddhn.u16 d23, q11, q3
168 vraddhn.u16 d22, q12, q15
/*
 * Tail half of the OVER(a8r8g8b8, r5g6b5) per-block arithmetic:
 * finishes the blend and repacks the planar result into r5g6b5
 * in {d28, d29} (q14), ready for the store.
 * NOTE(review): some lines (incl. .endm) are elided in this excerpt.
 */
171 .macro pixman_composite_over_8888_0565_process_pixblock_tail
172 /* ... continue alpha blending */
/* saturating add: src channel + (dst channel * inv_alpha) */
173 vqadd.u8 d16, d2, d20
175 /* convert the result to r5g6b5 and store it into {d28, d29} */
/* widen one channel into the top bits of each 16-bit lane ... */
176 vshll.u8 q14, d16, #8
/* ... then shift-right-insert packs the next channel below it */
180 vsri.u16 q14, q9, #11
184 * OK, now we got almost everything that we need. Using the above two
185 * macros, the work can be done right. But now we want to optimize
186 * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
187 * a lot from good code scheduling and software pipelining.
189 * Let's construct some code, which will run in the core main loop.
190 * Some pseudo-code of the main loop will look like this:
198 * It may look a bit weird, but this setup allows to hide instruction
199 * latencies better and also utilize dual-issue capability more
200 * efficiently (make pairs of load-store and ALU instructions).
202 * So what we need now is a '*_tail_head' macro, which will be used
203 * in the core main loop. A trivial straightforward implementation
204 * of this macro would look like this:
206 * pixman_composite_over_8888_0565_process_pixblock_tail
207 * vst1.16 {d28, d29}, [DST_W, :128]!
208 * vld1.16 {d4, d5}, [DST_R, :128]!
209 * vld4.32 {d0, d1, d2, d3}, [SRC]!
210 * pixman_composite_over_8888_0565_process_pixblock_head
213 * Now it also got some VLD/VST instructions. We simply can't move from
214 * processing one block of pixels to the other one with just arithmetics.
215 * The previously processed data needs to be written to memory and new
216 * data needs to be fetched. Fortunately, this main loop does not deal
217 * with partial leading/trailing pixels and can load/store a full block
218 * of pixels in a bulk. Additionally, destination buffer is already
219 * 16 bytes aligned here (which is good for performance).
221 * New things here are DST_R, DST_W, SRC and MASK identifiers. These
222 * are the aliases for ARM registers which are used as pointers for
223 * accessing data. We maintain separate pointers for reading and writing
224 * destination buffer (DST_R and DST_W).
226 * Another new thing is 'cache_preload' macro. It is used for prefetching
227 * data into CPU L2 cache and improve performance when dealing with large
228 * images which are far larger than cache size. It uses one argument
229 * (actually two, but they need to be the same here) - number of pixels
230 * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
231 * details about this macro. Moreover, if good performance is needed
232 * the code from this macro needs to be copied into '*_tail_head' macro
233 * and mixed with the rest of code for optimal instructions scheduling.
234 * We are actually doing it below.
236 * Now after all the explanations, here is the optimized code.
237 * Different instruction streams (originating from '*_head', '*_tail'
238 * and 'cache_preload' macro) use different indentation levels for
239 * better readability. Actually taking the code from one of these
240 * indentation levels and ignoring a few VLD/VST instructions would
241 * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
/*
 * Software-pipelined main-loop body: finishes the previous block
 * ('tail' stream), starts the next one ('head' stream) and interleaves
 * the 'PF'-prefixed prefetch stream.  'PF'-prefixed instructions are
 * emitted only when the advanced prefetch type is selected (see the
 * PREFETCH_TYPE_* discussion near the top of the file).
 * NOTE(review): a few lines (incl. .endm) are elided in this excerpt.
 */
247 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head
248 vqadd.u8 d16, d2, d20
249 vld1.16 {d4, d5}, [DST_R, :128]!
252 vld4.8 {d0, d1, d2, d3}, [SRC]!
255 vshll.u8 q14, d16, #8
256 PF add PF_X, PF_X, #8
260 PF addne PF_X, PF_X, #8
262 PF subne PF_CTL, PF_CTL, #1
264 vshrn.u16 d30, q2, #2
266 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
268 vmull.u8 q12, d3, d30
269 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
273 vrshr.u16 q13, q10, #8
274 PF subge PF_X, PF_X, ORIG_W
275 vrshr.u16 q3, q11, #8
276 vrshr.u16 q15, q12, #8
277 PF subges PF_CTL, PF_CTL, #0x10
278 vsri.u16 q14, q9, #11
279 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
280 vraddhn.u16 d20, q10, q13
281 vraddhn.u16 d23, q11, q3
282 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
283 vraddhn.u16 d22, q12, q15
/* store the 8 r5g6b5 results produced by the 'tail' stream */
284 vst1.16 {d28, d29}, [DST_W, :128]!
289 /* If we did not care much about the performance, we would just use this... */
/*
 * Unoptimized reference version of the macro above: tail, store,
 * reload, head — no instruction interleaving, no prefetch.
 * NOTE(review): this defines the SAME macro name as the optimized
 * version above; in the upstream file this variant is kept inside an
 * '#if 0' block (the guard lines appear to be elided from this
 * excerpt) — if both were active the assembler would reject the
 * duplicate .macro definition.  Also note 'vld4.32' here vs 'vld4.8'
 * in the optimized version — presumably it should be vld4.8 for the
 * byte-wise deinterleaved load; confirm against upstream.
 */
290 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head
291 pixman_composite_over_8888_0565_process_pixblock_tail
292 vst1.16 {d28, d29}, [DST_W, :128]!
293 vld1.16 {d4, d5}, [DST_R, :128]!
294 vld4.32 {d0, d1, d2, d3}, [SRC]!
295 pixman_composite_over_8888_0565_process_pixblock_head
302 * And now the final part. We are using 'generate_composite_function' macro
303 * to put all the stuff together. We are specifying the name of the function
304 * which we want to get, number of bits per pixel for the source, mask and
305 * destination (0 if unused, like mask in this case). Next come some bit
307 * FLAG_DST_READWRITE - tells that the destination buffer is both read
308 * and written, for write-only buffer we would use
309 * FLAG_DST_WRITEONLY flag instead
310 * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
311 * and separate color channels for 32bpp format.
312 * The next things are:
313 * - the number of pixels processed per iteration (8 in this case, because
314 * that's the maximum what can fit into four 64-bit NEON registers).
315 * - prefetch distance, measured in pixel blocks. In this case it is 5 times
316 * by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
317 * prefetch distance can be selected by running some benchmarks.
319 * After that we specify some macros, these are 'default_init',
320 * 'default_cleanup' here which are empty (but it is possible to have custom
321 * init/cleanup macros to be able to save/restore some extra NEON registers
322 * like d8-d15 or do anything else) followed by
323 * 'pixman_composite_over_8888_0565_process_pixblock_head',
324 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
325 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
326 * which we got implemented above.
328 * The last part is the NEON registers allocation scheme.
/*
 * Instantiate pixman_composite_over_8888_0565_asm_neon:
 * 32bpp source, no mask (0), 16bpp destination; destination is both
 * read and written; 32bpp data is loaded deinterleaved.
 * NOTE(review): the init/cleanup macro arguments (original lines
 * 335-336) are elided from this excerpt.
 */
330 generate_composite_function \
331 pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
332 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
333 8, /* number of pixels, processed in a single block */ \
334 5, /* prefetch distance */ \
337 pixman_composite_over_8888_0565_process_pixblock_head, \
338 pixman_composite_over_8888_0565_process_pixblock_tail, \
339 pixman_composite_over_8888_0565_process_pixblock_tail_head, \
340 28, /* dst_w_basereg */ \
341 4, /* dst_r_basereg */ \
342 0, /* src_basereg */ \
343 24 /* mask_basereg */
345 /******************************************************************************/
/*
 * SRC a8r8g8b8 -> r5g6b5: plain format conversion, no blending.
 * The head packs deinterleaved {d0..d3} channels into r5g6b5 in q14
 * (most of its body, and the .endm lines, are elided in this excerpt).
 */
347 .macro pixman_composite_src_8888_0565_process_pixblock_head
/* Tail: final shift-right-insert completes the r5g6b5 packing. */
353 .macro pixman_composite_src_8888_0565_process_pixblock_tail
355 vsri.u16 q14, q9, #11
/* Pipelined main-loop body with interleaved 'PF' prefetch stream. */
358 .macro pixman_composite_src_8888_0565_process_pixblock_tail_head
360 PF add PF_X, PF_X, #8
362 vld4.8 {d0, d1, d2, d3}, [SRC]!
363 PF addne PF_X, PF_X, #8
364 PF subne PF_CTL, PF_CTL, #1
365 vsri.u16 q14, q9, #11
367 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
369 vst1.16 {d28, d29}, [DST_W, :128]!
370 PF subge PF_X, PF_X, ORIG_W
371 PF subges PF_CTL, PF_CTL, #0x10
373 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
/*
 * Instantiate: 32bpp source, no mask, 16bpp write-only destination;
 * 32bpp data loaded deinterleaved; 8 pixels per block.
 */
377 generate_composite_function \
378 pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
379 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
380 8, /* number of pixels, processed in a single block */ \
381 10, /* prefetch distance */ \
384 pixman_composite_src_8888_0565_process_pixblock_head, \
385 pixman_composite_src_8888_0565_process_pixblock_tail, \
386 pixman_composite_src_8888_0565_process_pixblock_tail_head
388 /******************************************************************************/
/*
 * ADD a8 + a8: saturating byte-wise addition of source and destination.
 * Head/tail bodies (and .endm lines) are largely elided in this
 * excerpt; the visible tail_head handles 32 pixels (bytes) per block.
 */
390 .macro pixman_composite_add_8000_8000_process_pixblock_head
395 .macro pixman_composite_add_8000_8000_process_pixblock_tail
/* Pipelined main-loop body: load 32 src + 32 dst bytes, store the
   previous block's saturated sums, interleave 'PF' prefetch stream. */
398 .macro pixman_composite_add_8000_8000_process_pixblock_tail_head
399 vld1.8 {d0, d1, d2, d3}, [SRC]!
400 PF add PF_X, PF_X, #32
402 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
403 PF addne PF_X, PF_X, #32
404 PF subne PF_CTL, PF_CTL, #1
405 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
407 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
408 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
409 PF subge PF_X, PF_X, ORIG_W
410 PF subges PF_CTL, PF_CTL, #0x10
412 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
413 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
/* Instantiate: 8bpp src, no mask, 8bpp dst; 32 pixels per block. */
417 generate_composite_function \
418 pixman_composite_add_8000_8000_asm_neon, 8, 0, 8, \
419 FLAG_DST_READWRITE, \
420 32, /* number of pixels, processed in a single block */ \
421 10, /* prefetch distance */ \
424 pixman_composite_add_8000_8000_process_pixblock_head, \
425 pixman_composite_add_8000_8000_process_pixblock_tail, \
426 pixman_composite_add_8000_8000_process_pixblock_tail_head
428 /******************************************************************************/
/*
 * ADD a8r8g8b8 + a8r8g8b8: same saturating-add arithmetic as the
 * 8000_8000 variant (whose head/tail macros are reused below); only
 * the per-iteration pixel count (8 vs 32) and prefetch stepping differ.
 */
430 .macro pixman_composite_add_8888_8888_process_pixblock_tail_head
431 vld1.8 {d0, d1, d2, d3}, [SRC]!
432 PF add PF_X, PF_X, #8
434 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
435 PF addne PF_X, PF_X, #8
436 PF subne PF_CTL, PF_CTL, #1
437 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
439 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
440 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
441 PF subge PF_X, PF_X, ORIG_W
442 PF subges PF_CTL, PF_CTL, #0x10
444 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
445 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
/* Instantiate: 32bpp src, no mask, 32bpp dst; reuses the 8000_8000
   head/tail since the byte-wise ADD arithmetic is format-agnostic. */
449 generate_composite_function \
450 pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
451 FLAG_DST_READWRITE, \
452 8, /* number of pixels, processed in a single block */ \
453 10, /* prefetch distance */ \
456 pixman_composite_add_8000_8000_process_pixblock_head, \
457 pixman_composite_add_8000_8000_process_pixblock_tail, \
458 pixman_composite_add_8888_8888_process_pixblock_tail_head
460 /******************************************************************************/
/*
 * OVER a8r8g8b8 over a8r8g8b8.  Source deinterleaved in {d0..d3},
 * destination in {d4..d7}; result assembled in q14/q15 ({d28..d31}).
 * NOTE(review): several lines (incl. .endm terminators) are elided
 * in this excerpt.
 */
462 .macro pixman_composite_over_8888_8888_process_pixblock_head
463 vmvn.8 d24, d3 /* get inverted alpha */
464 /* do alpha blending */
/* q10/q11 = inv_alpha * dst red/alpha (q8/q9 counterparts elided) */
467 vmull.u8 q10, d24, d6
468 vmull.u8 q11, d24, d7
/* Tail: vrshr+vraddhn pairs implement the rounded x/255 division of
   each 16-bit product, then vqadd adds the source channels. */
471 .macro pixman_composite_over_8888_8888_process_pixblock_tail
472 vrshr.u16 q14, q8, #8
473 vrshr.u16 q15, q9, #8
474 vrshr.u16 q12, q10, #8
475 vrshr.u16 q13, q11, #8
476 vraddhn.u16 d28, q14, q8
477 vraddhn.u16 d29, q15, q9
478 vraddhn.u16 d30, q12, q10
479 vraddhn.u16 d31, q13, q11
480 vqadd.u8 q14, q0, q14
481 vqadd.u8 q15, q1, q15
/* Pipelined main-loop body with interleaved 'PF' prefetch stream.
   NOTE(review): d22 below presumably holds the inverted alpha set up
   by an elided 'vmvn.8 d22, d3' — confirm against upstream. */
484 .macro pixman_composite_over_8888_8888_process_pixblock_tail_head
485 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
486 vrshr.u16 q14, q8, #8
487 PF add PF_X, PF_X, #8
489 vrshr.u16 q15, q9, #8
490 vrshr.u16 q12, q10, #8
491 vrshr.u16 q13, q11, #8
492 PF addne PF_X, PF_X, #8
493 PF subne PF_CTL, PF_CTL, #1
494 vraddhn.u16 d28, q14, q8
495 vraddhn.u16 d29, q15, q9
497 vraddhn.u16 d30, q12, q10
498 vraddhn.u16 d31, q13, q11
499 vqadd.u8 q14, q0, q14
500 vqadd.u8 q15, q1, q15
501 vld4.8 {d0, d1, d2, d3}, [SRC]!
502 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
504 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
505 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
506 PF subge PF_X, PF_X, ORIG_W
508 PF subges PF_CTL, PF_CTL, #0x10
510 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
511 vmull.u8 q10, d22, d6
512 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
513 vmull.u8 q11, d22, d7
/* Instantiate: 32bpp src, no mask, 32bpp read/write dst,
   deinterleaved loads, 8 pixels per block. */
516 generate_composite_function \
517 pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
518 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
519 8, /* number of pixels, processed in a single block */ \
520 5, /* prefetch distance */ \
523 pixman_composite_over_8888_8888_process_pixblock_head, \
524 pixman_composite_over_8888_8888_process_pixblock_tail, \
525 pixman_composite_over_8888_8888_process_pixblock_tail_head
527 /******************************************************************************/
/*
 * OVER (solid source, a8 mask) over r5g6b5.  The solid colour is held
 * in d8-d11 (set up by the init macro further below); the a8 mask is
 * in d24.  First the mask is multiplied into the colour (yielding the
 * effective source in d0-d3), then the usual OVER blend runs against
 * the r5g6b5 destination.
 * NOTE(review): several lines (incl. .endm terminators) are elided.
 */
529 .macro pixman_composite_over_n_8_0565_process_pixblock_head
/* mask * colour for red/alpha (blue/green multiplies elided);
   each vrshr+vraddhn pair below is the rounded x/255 reduction */
533 vmull.u8 q6, d24, d10
534 vmull.u8 q7, d24, d11
535 vrshr.u16 q10, q0, #8
536 vrshr.u16 q11, q1, #8
537 vrshr.u16 q12, q6, #8
538 vrshr.u16 q13, q7, #8
539 vraddhn.u16 d0, q0, q10
540 vraddhn.u16 d1, q1, q11
541 vraddhn.u16 d2, q6, q12
542 vraddhn.u16 d3, q7, q13
/* extract a channel from the packed r5g6b5 dst (companions elided) */
550 vshrn.u16 d30, q2, #2
551 /* now do alpha blending */
554 vmull.u8 q12, d3, d30
555 vrshr.u16 q13, q10, #8
556 vrshr.u16 q3, q11, #8
557 vrshr.u16 q15, q12, #8
558 vraddhn.u16 d20, q10, q13
559 vraddhn.u16 d23, q11, q3
560 vraddhn.u16 d22, q12, q15
/* Tail: saturating add of source, then repack to r5g6b5 in q14. */
563 .macro pixman_composite_over_n_8_0565_process_pixblock_tail
564 vqadd.u8 d16, d2, d20
566 /* convert to r5g6b5 */
567 vshll.u8 q14, d16, #8
571 vsri.u16 q14, q9, #11
574 /* TODO: expand macros and do better instructions scheduling */
/* Trivial (non-interleaved) main-loop body: tail, store, reload, head. */
575 .macro pixman_composite_over_n_8_0565_process_pixblock_tail_head
576 pixman_composite_over_n_8_0565_process_pixblock_tail
577 vst1.16 {d28, d29}, [DST_W, :128]!
578 vld1.16 {d4, d5}, [DST_R, :128]!
579 vld1.8 {d24}, [MASK]!
581 pixman_composite_over_n_8_0565_process_pixblock_head
585 * This function needs a special initialization of solid mask.
586 * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
587 * offset, split into color components and replicated in d8-d11
588 * registers. Additionally, this function needs all the NEON registers,
589 * so it has to save d8-d15 registers which are callee saved according
590 * to ABI. These registers are restored from 'cleanup' macro. All the
591 * other NEON registers are caller saved, so can be clobbered freely
592 * without introducing any problems.
/*
 * Init: fetch the solid source colour from the stack (ARGS_STACK_OFFSET)
 * and load it into d11 for splitting/replication across d8-d11
 * (the replication instructions, the callee-saved d8-d15 save, and
 * the .endm are elided in this excerpt).
 */
594 .macro pixman_composite_over_n_8_0565_init
595 add DUMMY, sp, #ARGS_STACK_OFFSET
597 vld1.32 {d11[0]}, [DUMMY]
/* Cleanup: restores the d8-d15 registers saved by init (body elided). */
604 .macro pixman_composite_over_n_8_0565_cleanup
/* Instantiate: solid src (0), 8bpp mask, 16bpp read/write dst. */
608 generate_composite_function \
609 pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
610 FLAG_DST_READWRITE, \
611 8, /* number of pixels, processed in a single block */ \
612 5, /* prefetch distance */ \
613 pixman_composite_over_n_8_0565_init, \
614 pixman_composite_over_n_8_0565_cleanup, \
615 pixman_composite_over_n_8_0565_process_pixblock_head, \
616 pixman_composite_over_n_8_0565_process_pixblock_tail, \
617 pixman_composite_over_n_8_0565_process_pixblock_tail_head
619 /******************************************************************************/
/*
 * SRC r5g6b5 -> r5g6b5: a plain 16bpp copy.  Head and tail do no
 * arithmetic (bodies/.endm elided); the loop body just streams
 * 16 pixels at a time through {d0..d3}.
 */
621 .macro pixman_composite_src_0565_0565_process_pixblock_head
624 .macro pixman_composite_src_0565_0565_process_pixblock_tail
627 .macro pixman_composite_src_0565_0565_process_pixblock_tail_head
628 vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
629 vld1.16 {d0, d1, d2, d3}, [SRC]!
/* Instantiate: 16bpp src, no mask, 16bpp write-only dst; data lives in
   d0-d3, hence the basereg overrides of 0.  NOTE(review): the trailing
   mask_basereg argument line is elided from this excerpt. */
633 generate_composite_function \
634 pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
635 FLAG_DST_WRITEONLY, \
636 16, /* number of pixels, processed in a single block */ \
637 10, /* prefetch distance */ \
640 pixman_composite_src_0565_0565_process_pixblock_head, \
641 pixman_composite_src_0565_0565_process_pixblock_tail, \
642 pixman_composite_src_0565_0565_process_pixblock_tail_head, \
643 0, /* dst_w_basereg */ \
644 0, /* dst_r_basereg */ \
645 0, /* src_basereg */ \
648 /******************************************************************************/
/*
 * SRC solid -> a8: fill with a constant byte.  Head/tail are empty
 * (bodies/.endm elided); the loop body just stores the replicated
 * value from {d0..d3}, 32 bytes per iteration.
 */
650 .macro pixman_composite_src_n_8_process_pixblock_head
653 .macro pixman_composite_src_n_8_process_pixblock_tail
656 .macro pixman_composite_src_n_8_process_pixblock_tail_head
657 vst1.8 {d0, d1, d2, d3}, [DST_W, :128]!
/* Init: fetch the solid colour from the stack and load into d0[0];
   the replication across d0-d3 is among the elided lines. */
660 .macro pixman_composite_src_n_8_init
661 add DUMMY, sp, #ARGS_STACK_OFFSET
662 vld1.32 {d0[0]}, [DUMMY]
670 .macro pixman_composite_src_n_8_cleanup
/* Instantiate: solid src, no mask, 8bpp write-only dst; prefetch
   distance 0 disables prefetching (no source to prefetch).
   NOTE(review): trailing mask_basereg line elided from this excerpt. */
673 generate_composite_function \
674 pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
675 FLAG_DST_WRITEONLY, \
676 32, /* number of pixels, processed in a single block */ \
677 0, /* prefetch distance */ \
678 pixman_composite_src_n_8_init, \
679 pixman_composite_src_n_8_cleanup, \
680 pixman_composite_src_n_8_process_pixblock_head, \
681 pixman_composite_src_n_8_process_pixblock_tail, \
682 pixman_composite_src_n_8_process_pixblock_tail_head, \
683 0, /* dst_w_basereg */ \
684 0, /* dst_r_basereg */ \
685 0, /* src_basereg */ \
688 /******************************************************************************/
/*
 * SRC solid -> r5g6b5: constant fill of a 16bpp buffer.  Head/tail are
 * empty (bodies/.endm elided); the loop stores 16 pixels per iteration.
 */
690 .macro pixman_composite_src_n_0565_process_pixblock_head
693 .macro pixman_composite_src_n_0565_process_pixblock_tail
696 .macro pixman_composite_src_n_0565_process_pixblock_tail_head
697 vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
/* Init: fetch the solid colour from the stack; conversion to r5g6b5
   and replication across d0-d3 are among the elided lines. */
700 .macro pixman_composite_src_n_0565_init
701 add DUMMY, sp, #ARGS_STACK_OFFSET
702 vld1.32 {d0[0]}, [DUMMY]
709 .macro pixman_composite_src_n_0565_cleanup
/* Instantiate: solid src, no mask, 16bpp write-only dst; no prefetch.
   NOTE(review): trailing mask_basereg line elided from this excerpt. */
712 generate_composite_function \
713 pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
714 FLAG_DST_WRITEONLY, \
715 16, /* number of pixels, processed in a single block */ \
716 0, /* prefetch distance */ \
717 pixman_composite_src_n_0565_init, \
718 pixman_composite_src_n_0565_cleanup, \
719 pixman_composite_src_n_0565_process_pixblock_head, \
720 pixman_composite_src_n_0565_process_pixblock_tail, \
721 pixman_composite_src_n_0565_process_pixblock_tail_head, \
722 0, /* dst_w_basereg */ \
723 0, /* dst_r_basereg */ \
724 0, /* src_basereg */ \
727 /******************************************************************************/
/*
 * SRC solid -> a8r8g8b8: constant fill of a 32bpp buffer.  Head/tail
 * are empty (bodies/.endm elided); the loop stores 8 pixels per
 * iteration from {d0..d3}.
 */
729 .macro pixman_composite_src_n_8888_process_pixblock_head
732 .macro pixman_composite_src_n_8888_process_pixblock_tail
735 .macro pixman_composite_src_n_8888_process_pixblock_tail_head
736 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
/* Init: fetch the solid colour; replication across d0-d3 elided. */
739 .macro pixman_composite_src_n_8888_init
740 add DUMMY, sp, #ARGS_STACK_OFFSET
741 vld1.32 {d0[0]}, [DUMMY]
747 .macro pixman_composite_src_n_8888_cleanup
/* Instantiate: solid src, no mask, 32bpp write-only dst; no prefetch.
   NOTE(review): trailing mask_basereg line elided from this excerpt. */
750 generate_composite_function \
751 pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
752 FLAG_DST_WRITEONLY, \
753 8, /* number of pixels, processed in a single block */ \
754 0, /* prefetch distance */ \
755 pixman_composite_src_n_8888_init, \
756 pixman_composite_src_n_8888_cleanup, \
757 pixman_composite_src_n_8888_process_pixblock_head, \
758 pixman_composite_src_n_8888_process_pixblock_tail, \
759 pixman_composite_src_n_8888_process_pixblock_tail_head, \
760 0, /* dst_w_basereg */ \
761 0, /* dst_r_basereg */ \
762 0, /* src_basereg */ \
765 /******************************************************************************/
/*
 * SRC a8r8g8b8 -> a8r8g8b8: plain 32bpp copy.  Head/tail are empty
 * (bodies/.endm elided); the loop streams 8 pixels per iteration.
 */
767 .macro pixman_composite_src_8888_8888_process_pixblock_head
770 .macro pixman_composite_src_8888_8888_process_pixblock_tail
773 .macro pixman_composite_src_8888_8888_process_pixblock_tail_head
774 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
775 vld1.32 {d0, d1, d2, d3}, [SRC]!
/* Instantiate: 32bpp src, no mask, 32bpp write-only dst; data in d0-d3.
   NOTE(review): trailing mask_basereg line elided from this excerpt. */
779 generate_composite_function \
780 pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
781 FLAG_DST_WRITEONLY, \
782 8, /* number of pixels, processed in a single block */ \
783 10, /* prefetch distance */ \
786 pixman_composite_src_8888_8888_process_pixblock_head, \
787 pixman_composite_src_8888_8888_process_pixblock_tail, \
788 pixman_composite_src_8888_8888_process_pixblock_tail_head, \
789 0, /* dst_w_basereg */ \
790 0, /* dst_r_basereg */ \
791 0, /* src_basereg */ \
794 /******************************************************************************/
/*
 * OVER (solid source, a8 mask) over a8r8g8b8.  First 'mask IN colour'
 * (d24 * d8-d11 -> d0-d3), then the standard OVER blend against the
 * destination in d4-d7.  The vrshr+vraddhn pairs are the rounded
 * x/255 reduction of each 16-bit product.
 * NOTE(review): several lines (incl. .endm terminators) are elided.
 */
796 .macro pixman_composite_over_n_8_8888_process_pixblock_head
797 /* expecting deinterleaved source data in {d8, d9, d10, d11} */
798 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
799 /* and destination data in {d4, d5, d6, d7} */
800 /* mask is in d24 (d25, d26, d27 are unused) */
805 vmull.u8 q6, d24, d10
806 vmull.u8 q7, d24, d11
807 vrshr.u16 q10, q0, #8
808 vrshr.u16 q11, q1, #8
809 vrshr.u16 q12, q6, #8
810 vrshr.u16 q13, q7, #8
811 vraddhn.u16 d0, q0, q10
812 vraddhn.u16 d1, q1, q11
813 vraddhn.u16 d2, q6, q12
814 vraddhn.u16 d3, q7, q13
815 vmvn.8 d24, d3 /* get inverted alpha */
816 /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */
817 /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
818 /* now do alpha blending */
821 vmull.u8 q10, d24, d6
822 vmull.u8 q11, d24, d7
/* Tail: rounded /255 of inv_alpha*dst, then saturating add of source. */
825 .macro pixman_composite_over_n_8_8888_process_pixblock_tail
826 vrshr.u16 q14, q8, #8
827 vrshr.u16 q15, q9, #8
828 vrshr.u16 q12, q10, #8
829 vrshr.u16 q13, q11, #8
830 vraddhn.u16 d28, q14, q8
831 vraddhn.u16 d29, q15, q9
832 vraddhn.u16 d30, q12, q10
833 vraddhn.u16 d31, q13, q11
834 vqadd.u8 q14, q0, q14
835 vqadd.u8 q15, q1, q15
838 /* TODO: expand macros and do better instructions scheduling */
/* Trivial (non-interleaved) main-loop body: tail, store, reload, head. */
839 .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
840 pixman_composite_over_n_8_8888_process_pixblock_tail
841 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
842 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
843 vld1.8 {d24}, [MASK]!
845 pixman_composite_over_n_8_8888_process_pixblock_head
/* Init: fetch the solid colour from the stack into d11; splitting and
   the callee-saved d8-d15 save are among the elided lines. */
848 .macro pixman_composite_over_n_8_8888_init
849 add DUMMY, sp, #ARGS_STACK_OFFSET
851 vld1.32 {d11[0]}, [DUMMY]
/* Cleanup: restores registers saved by init (body elided). */
858 .macro pixman_composite_over_n_8_8888_cleanup
/* Instantiate: solid src, 8bpp mask, 32bpp read/write dst,
   deinterleaved 32bpp loads, 8 pixels per block. */
862 generate_composite_function \
863 pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
864 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
865 8, /* number of pixels, processed in a single block */ \
866 5, /* prefetch distance */ \
867 pixman_composite_over_n_8_8888_init, \
868 pixman_composite_over_n_8_8888_cleanup, \
869 pixman_composite_over_n_8_8888_process_pixblock_head, \
870 pixman_composite_over_n_8_8888_process_pixblock_tail, \
871 pixman_composite_over_n_8_8888_process_pixblock_tail_head
873 /******************************************************************************/
/*
 * ADD (solid source, a8 mask) to a8 destination: computes
 * mask * solid_alpha (rounded /255) and saturating-adds it to dst.
 * NOTE(review): several lines (incl. .endm terminators and the final
 * vqadd of the head) are elided in this excerpt.
 */
875 .macro pixman_composite_add_n_8_8_process_pixblock_head
876 /* expecting source data in {d8, d9, d10, d11} */
877 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
878 /* and destination data in {d4, d5, d6, d7} */
879 /* mask is in d24, d25, d26, d27 */
/* 32 mask bytes * solid alpha; vrshr+vraddhn = rounded x/255 */
880 vmull.u8 q0, d24, d11
881 vmull.u8 q1, d25, d11
882 vmull.u8 q6, d26, d11
883 vmull.u8 q7, d27, d11
884 vrshr.u16 q10, q0, #8
885 vrshr.u16 q11, q1, #8
886 vrshr.u16 q12, q6, #8
887 vrshr.u16 q13, q7, #8
888 vraddhn.u16 d0, q0, q10
889 vraddhn.u16 d1, q1, q11
890 vraddhn.u16 d2, q6, q12
891 vraddhn.u16 d3, q7, q13
896 .macro pixman_composite_add_n_8_8_process_pixblock_tail
899 /* TODO: expand macros and do better instructions scheduling */
/* Trivial (non-interleaved) main-loop body: tail, store, reload, head. */
900 .macro pixman_composite_add_n_8_8_process_pixblock_tail_head
901 pixman_composite_add_n_8_8_process_pixblock_tail
902 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
903 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
904 vld1.8 {d24, d25, d26, d27}, [MASK]!
906 pixman_composite_add_n_8_8_process_pixblock_head
/* Init: fetch the solid colour from the stack into d11 (replication
   and register saving among the elided lines). */
909 .macro pixman_composite_add_n_8_8_init
910 add DUMMY, sp, #ARGS_STACK_OFFSET
912 vld1.32 {d11[0]}, [DUMMY]
916 .macro pixman_composite_add_n_8_8_cleanup
/* Instantiate: solid src, 8bpp mask, 8bpp dst; 32 pixels per block. */
920 generate_composite_function \
921 pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
922 FLAG_DST_READWRITE, \
923 32, /* number of pixels, processed in a single block */ \
924 5, /* prefetch distance */ \
925 pixman_composite_add_n_8_8_init, \
926 pixman_composite_add_n_8_8_cleanup, \
927 pixman_composite_add_n_8_8_process_pixblock_head, \
928 pixman_composite_add_n_8_8_process_pixblock_tail, \
929 pixman_composite_add_n_8_8_process_pixblock_tail_head
931 /******************************************************************************/
/*
 * ADD (a8 source, a8 mask) to a8 destination: src IN mask (rounded
 * /255), then saturating add to dst.
 * NOTE(review): several lines (incl. .endm terminators, the q8/q9
 * multiplies and the final vqadd) are elided in this excerpt.
 */
933 .macro pixman_composite_add_8_8_8_process_pixblock_head
934 /* expecting source data in {d0, d1, d2, d3} */
935 /* destination data in {d4, d5, d6, d7} */
936 /* mask in {d24, d25, d26, d27} */
939 vmull.u8 q10, d26, d2
940 vmull.u8 q11, d27, d3
943 vrshr.u16 q12, q10, #8
944 vrshr.u16 q13, q11, #8
945 vraddhn.u16 d0, q0, q8
946 vraddhn.u16 d1, q1, q9
947 vraddhn.u16 d2, q12, q10
948 vraddhn.u16 d3, q13, q11
953 .macro pixman_composite_add_8_8_8_process_pixblock_tail
956 /* TODO: expand macros and do better instructions scheduling */
/* Trivial (non-interleaved) main-loop body: tail, store, reload, head. */
957 .macro pixman_composite_add_8_8_8_process_pixblock_tail_head
958 pixman_composite_add_8_8_8_process_pixblock_tail
959 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
960 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
961 vld1.8 {d24, d25, d26, d27}, [MASK]!
962 vld1.8 {d0, d1, d2, d3}, [SRC]!
964 pixman_composite_add_8_8_8_process_pixblock_head
/* Empty init/cleanup placeholders (bodies/.endm elided). */
967 .macro pixman_composite_add_8_8_8_init
970 .macro pixman_composite_add_8_8_8_cleanup
/* Instantiate: 8bpp src, 8bpp mask, 8bpp dst; 32 pixels per block. */
973 generate_composite_function \
974 pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
975 FLAG_DST_READWRITE, \
976 32, /* number of pixels, processed in a single block */ \
977 5, /* prefetch distance */ \
978 pixman_composite_add_8_8_8_init, \
979 pixman_composite_add_8_8_8_cleanup, \
980 pixman_composite_add_8_8_8_process_pixblock_head, \
981 pixman_composite_add_8_8_8_process_pixblock_tail, \
982 pixman_composite_add_8_8_8_process_pixblock_tail_head
984 /******************************************************************************/
986 .macro pixman_composite_add_8888_8888_8888_process_pixblock_head
987 /* expecting source data in {d0, d1, d2, d3} */
988 /* destination data in {d4, d5, d6, d7} */
989 /* mask in {d24, d25, d26, d27} */
992 vmull.u8 q10, d27, d2
993 vmull.u8 q11, d27, d3
996 vrshr.u16 q12, q10, #8
997 vrshr.u16 q13, q11, #8
998 vraddhn.u16 d0, q0, q8
999 vraddhn.u16 d1, q1, q9
1000 vraddhn.u16 d2, q12, q10
1001 vraddhn.u16 d3, q13, q11
1002 vqadd.u8 q14, q0, q2
1003 vqadd.u8 q15, q1, q3
1006 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
1009 /* TODO: expand macros and do better instructions scheduling */
/*
 * Fused tail+head for the main loop: store the previous block's result,
 * then load the next 8 pixels of dst/mask/src and start processing.
 * vst4/vld4 interleave/deinterleave the 32 bpp pixels so that each
 * d register holds one color channel (FLAG_DEINTERLEAVE_32BPP layout).
 */
1010 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
1011 pixman_composite_add_8888_8888_8888_process_pixblock_tail
/* store 8 finished pixels from q14-q15, re-interleaving channels */
1012 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* load next 8 destination pixels, one channel per d register */
1013 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
/* load next 8 mask pixels (d27 = alpha channel) */
1014 vld4.8 {d24, d25, d26, d27}, [MASK]!
/* load next 8 source pixels */
1015 vld4.8 {d0, d1, d2, d3}, [SRC]!
1017 pixman_composite_add_8888_8888_8888_process_pixblock_head
/*
 * Instantiate the masked ADD fast path for 32 bpp src/mask/dst.
 * NOTE(review): the init/cleanup hook arguments (between the prefetch
 * distance and the head macro) are on lines elided from this chunk --
 * confirm against the full file.
 */
1020 generate_composite_function \
1021 pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
1022 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1023 8, /* number of pixels, processed in a single block */ \
1024 10, /* prefetch distance */ \
1027 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1028 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1029 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
1031 /******************************************************************************/
/*
 * Head for OVER with a solid (n) mask: first scales every source
 * channel by the mask byte replicated in d15 (in = src * m / 255,
 * using the rounding vmull/vrshr/vraddhn divide-by-255 idiom), then
 * starts the alpha blend by multiplying the destination channels by
 * the inverted masked source alpha.  The tail macro finishes the blend.
 */
1033 .macro pixman_composite_over_8888_n_8888_process_pixblock_head
1034 /* expecting source data in {d0, d1, d2, d3} */
1035 /* destination data in {d4, d5, d6, d7} */
1036 /* solid mask is in d15 */
/* widening multiply: each source channel times the mask byte */
1039 vmull.u8 q8, d15, d3
1040 vmull.u8 q6, d15, d2
1041 vmull.u8 q5, d15, d1
1042 vmull.u8 q4, d15, d0
/* rounding correction terms: (x + 128) >> 8 */
1043 vrshr.u16 q13, q8, #8
1044 vrshr.u16 q12, q6, #8
1045 vrshr.u16 q11, q5, #8
1046 vrshr.u16 q10, q4, #8
/* narrow back to 8 bit: d0-d3 = src * mask / 255 (rounded) */
1047 vraddhn.u16 d3, q8, q13
1048 vraddhn.u16 d2, q6, q12
1049 vraddhn.u16 d1, q5, q11
1050 vraddhn.u16 d0, q4, q10
1051 vmvn.8 d24, d3 /* get inverted alpha */
1052 /* now do alpha blending */
/* q8-q11 = dst_channel * (255 - masked_src_alpha), finished in tail */
1053 vmull.u8 q8, d24, d4
1054 vmull.u8 q9, d24, d5
1055 vmull.u8 q10, d24, d6
1056 vmull.u8 q11, d24, d7
/*
 * Tail for OVER with a solid mask: finishes the divide-by-255 of
 * dst * (255 - alpha) started in the head, then adds the masked source
 * (q0-q1) with saturation.  Final pixels land in q14-q15 (d28-d31),
 * matching the dst_w_basereg used by the store in the tail_head macro.
 */
1059 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail
/* rounding correction terms: (x + 128) >> 8 */
1060 vrshr.u16 q14, q8, #8
1061 vrshr.u16 q15, q9, #8
1062 vrshr.u16 q12, q10, #8
1063 vrshr.u16 q13, q11, #8
/* narrow to 8 bit: d28-d31 = dst * (255 - alpha) / 255 (rounded) */
1064 vraddhn.u16 d28, q14, q8
1065 vraddhn.u16 d29, q15, q9
1066 vraddhn.u16 d30, q12, q10
1067 vraddhn.u16 d31, q13, q11
/* result = masked_src + dst * (1 - alpha), with saturation */
1068 vqadd.u8 q14, q0, q14
1069 vqadd.u8 q15, q1, q15
1072 /* TODO: expand macros and do better instructions scheduling */
/*
 * Fused tail+head for the main loop: load the next destination block,
 * finish blending the previous block, load the next source block,
 * start processing it, and store the previous block's result.
 */
1073 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
/* load next 8 destination pixels, one channel per d register */
1074 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1075 pixman_composite_over_8888_n_8888_process_pixblock_tail
/* load next 8 source pixels */
1076 vld4.8 {d0, d1, d2, d3}, [SRC]!
1078 pixman_composite_over_8888_n_8888_process_pixblock_head
/* store 8 finished pixels from q14-q15 */
1079 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/*
 * Init hook: fetches the 32-bit solid mask value into lane 0 of d15,
 * where the head macro expects it.
 * NOTE(review): the surrounding init instructions (setting up DUMMY to
 * point at the stack-passed mask, saving d8-d15, and broadcasting the
 * alpha byte across d15) are on lines elided from this chunk -- confirm
 * against the full file.
 */
1082 .macro pixman_composite_over_8888_n_8888_init
1085 vld1.32 {d15[0]}, [DUMMY]
1089 .macro pixman_composite_over_8888_n_8888_cleanup
/* Cleanup hook (counterpart of the init hook above).
 * NOTE(review): the macro body is not visible in this chunk --
 * confirm against the full file. */
/*
 * Instantiate OVER with a solid mask: 32 bpp source, no mask image
 * (mask bpp = 0; the solid value is loaded by the init hook), 32 bpp
 * destination.
 */
1093 generate_composite_function \
1094 pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
1095 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1096 8, /* number of pixels, processed in a single block */ \
1097 5, /* prefetch distance */ \
1098 pixman_composite_over_8888_n_8888_init, \
1099 pixman_composite_over_8888_n_8888_cleanup, \
1100 pixman_composite_over_8888_n_8888_process_pixblock_head, \
1101 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1102 pixman_composite_over_8888_n_8888_process_pixblock_tail_head
1104 /******************************************************************************/
1106 /* TODO: expand macros and do better instructions scheduling */
/*
 * Fused tail+head for OVER with a full a8r8g8b8 mask.  Reuses the
 * over_8888_n_8888 head/tail macros: the mask is deinterleaved into
 * d12-d15 so that d15 holds the mask alpha bytes, exactly where those
 * macros expect the (per-pixel, here) mask value.
 */
1107 .macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
/* load next 8 destination pixels, one channel per d register */
1108 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1109 pixman_composite_over_8888_n_8888_process_pixblock_tail
/* load next 8 source pixels */
1110 vld4.8 {d0, d1, d2, d3}, [SRC]!
/* load next 8 mask pixels; alpha channel ends up in d15 */
1112 vld4.8 {d12, d13, d14, d15}, [MASK]!
1113 pixman_composite_over_8888_n_8888_process_pixblock_head
/* store 8 finished pixels from q14-q15 */
1114 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1117 .macro pixman_composite_over_8888_8888_8888_init
/* Init hook. NOTE(review): the macro body (likely saving the d8-d15
 * registers clobbered by the mask loads) is not visible in this chunk
 * -- confirm against the full file. */
1121 .macro pixman_composite_over_8888_8888_8888_cleanup
/* Cleanup hook (counterpart of the init hook above).
 * NOTE(review): the macro body is not visible in this chunk --
 * confirm against the full file. */
/*
 * Instantiate OVER with a 32 bpp mask.  The explicit basereg overrides
 * tell the framework where each surface's pixel data lives: the mask is
 * loaded starting at d12 (so its alpha lands in d15, as required by the
 * shared over_8888_n_8888 head/tail macros).
 */
1125 generate_composite_function \
1126 pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
1127 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1128 8, /* number of pixels, processed in a single block */ \
1129 5, /* prefetch distance */ \
1130 pixman_composite_over_8888_8888_8888_init, \
1131 pixman_composite_over_8888_8888_8888_cleanup, \
1132 pixman_composite_over_8888_n_8888_process_pixblock_head, \
1133 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1134 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
1135 28, /* dst_w_basereg */ \
1136 4, /* dst_r_basereg */ \
1137 0, /* src_basereg */ \
1138 12 /* mask_basereg */
1140 /******************************************************************************/
1142 /* TODO: expand macros and do better instructions scheduling */
/*
 * Fused tail+head for OVER with an a8 mask.  Reuses the
 * over_8888_n_8888 head/tail macros by loading 8 mask bytes straight
 * into d15, where those macros expect the mask value.
 */
1143 .macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
/* load next 8 destination pixels, one channel per d register */
1144 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1145 pixman_composite_over_8888_n_8888_process_pixblock_tail
/* load next 8 source pixels */
1146 vld4.8 {d0, d1, d2, d3}, [SRC]!
/* load 8 a8 mask bytes directly into d15 */
1148 vld1.8 {d15}, [MASK]!
1149 pixman_composite_over_8888_n_8888_process_pixblock_head
/* store 8 finished pixels from q14-q15 */
1150 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1153 .macro pixman_composite_over_8888_8_8888_init
/* Init hook. NOTE(review): the macro body (likely saving the d8-d15
 * registers clobbered by the d15 mask load) is not visible in this
 * chunk -- confirm against the full file. */
1157 .macro pixman_composite_over_8888_8_8888_cleanup
/* Cleanup hook (counterpart of the init hook above).
 * NOTE(review): the macro body is not visible in this chunk --
 * confirm against the full file. */
/*
 * Instantiate OVER with an 8 bpp (a8) mask.  mask_basereg = 15 matches
 * the vld1.8 {d15} mask load in the tail_head macro above, and the
 * shared over_8888_n_8888 head/tail macros read the mask from d15.
 */
1161 generate_composite_function \
1162 pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
1163 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1164 8, /* number of pixels, processed in a single block */ \
1165 5, /* prefetch distance */ \
1166 pixman_composite_over_8888_8_8888_init, \
1167 pixman_composite_over_8888_8_8888_cleanup, \
1168 pixman_composite_over_8888_n_8888_process_pixblock_head, \
1169 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1170 pixman_composite_over_8888_8_8888_process_pixblock_tail_head \
1171 28, /* dst_w_basereg */ \
1172 4, /* dst_r_basereg */ \
1173 0, /* src_basereg */ \
1174 15 /* mask_basereg */
1176 /******************************************************************************/
1178 .macro pixman_composite_src_0888_0888_process_pixblock_head
/* Intentionally empty: this fast path is a plain 24 bpp copy, so
 * there is no per-pixel processing to do. */
1181 .macro pixman_composite_src_0888_0888_process_pixblock_tail
/* Intentionally empty: nothing to finish for a plain copy. */
/*
 * Fused tail+head for the 24 bpp copy loop: store the 8 pixels loaded
 * on the previous iteration, then load the next 8.  vst3/vld3 handle
 * the 3-bytes-per-pixel layout, one byte plane per d register (base
 * register 0, matching the basereg overrides passed below).
 */
1184 .macro pixman_composite_src_0888_0888_process_pixblock_tail_head
1185 vst3.8 {d0, d1, d2}, [DST_W]!
1186 vld3.8 {d0, d1, d2}, [SRC]!
/*
 * Instantiate the SRC (plain copy) fast path for 24 bpp -> 24 bpp with
 * no mask.  FLAG_DST_WRITEONLY: the destination is never read.
 * NOTE(review): the init/cleanup hook arguments (between the prefetch
 * distance and the head macro) are on lines elided from this chunk --
 * confirm against the full file.
 */
1190 generate_composite_function \
1191 pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
1192 FLAG_DST_WRITEONLY, \
1193 8, /* number of pixels, processed in a single block */ \
1194 10, /* prefetch distance */ \
1197 pixman_composite_src_0888_0888_process_pixblock_head, \
1198 pixman_composite_src_0888_0888_process_pixblock_tail, \
1199 pixman_composite_src_0888_0888_process_pixblock_tail_head, \
1200 0, /* dst_w_basereg */ \
1201 0, /* dst_r_basereg */ \
1202 0, /* src_basereg */ \
1203 0 /* mask_basereg */