2 * Copyright © 2009 Nokia Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
23 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
27 * This file contains implementations of NEON optimized pixel processing
28 * functions. There is no full and detailed tutorial, but some functions
29 * (those which are exposing some new or interesting features) are
30 * extensively commented and can be used as examples.
32 * You may want to have a look at the comments for following functions:
33 * - pixman_composite_over_8888_0565_asm_neon
34 * - pixman_composite_over_n_8_0565_asm_neon
37 /* Prevent the stack from becoming executable for no reason... */
38 #if defined(__linux__) && defined(__ELF__)
39 .section .note.GNU-stack,"",%progbits
46 .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
47 .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
51 #include "pixman-arm-neon-asm.h"
53 /* Global configuration options and preferences */
56 * The code can optionally make use of unaligned memory accesses to improve
57 * performance of handling leading/trailing pixels for each scanline.
58 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
59 * example in linux if unaligned memory accesses are not configured to
60 * generate exceptions.
62 .set RESPECT_STRICT_ALIGNMENT, 1
65 * Set default prefetch type. There is a choice between the following options:
67 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
68 * as NOP to workaround some HW bugs or for whatever other reason)
70 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
71 * advanced prefetch introduces heavy overhead)
73 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
74 * which can run ARM and NEON instructions simultaneously so that extra ARM
75 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
77 * Note: some types of function can't support advanced prefetch and fall back
78 * to simple one (those which handle 24bpp pixels)
80 .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
82 /* Prefetch distance in pixels for simple prefetch */
83 .set PREFETCH_DISTANCE_SIMPLE, 64
86 * Implementation of pixman_composite_over_8888_0565_asm_neon
88 * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
89 * performs OVER compositing operation. Function fast_composite_over_8888_0565
90 * from pixman-fast-path.c does the same in C and can be used as a reference.
92 * First we need to have some NEON assembly code which can do the actual
93 * operation on the pixels and provide it to the template macro.
95 * Template macro quite conveniently takes care of emitting all the necessary
96 * code for memory reading and writing (including quite tricky cases of
97 * handling unaligned leading/trailing pixels), so we only need to deal with
98 * the data in NEON registers.
100 * NEON register allocation in general is recommended to be the following:
101 * d0, d1, d2, d3 - contain loaded source pixel data
102 * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed)
103 * d24, d25, d26, d27 - contain loaded mask pixel data (if mask is used)
104 * d28, d29, d30, d31 - place for storing the result (destination pixels)
106 * As can be seen above, four 64-bit NEON registers are used for keeping
107 * intermediate pixel data and up to 8 pixels can be processed in one step
108 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
110 * This particular function uses the following registers allocation:
111 * d0, d1, d2, d3 - contain loaded source pixel data
112 * d4, d5 - contain loaded destination pixels (they are needed)
113 * d28, d29 - place for storing the result (destination pixels)
117 * Step one. We need to have some code to do some arithmetics on pixel data.
118 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
119 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
120 * perform all the needed calculations and write the result to {d28, d29}.
121 * The rationale for having two macros and not just one will be explained
122 * later. In practice, any single monolithic function which does the work can
123 * be split into two parts in any arbitrary way without affecting correctness.
125 * There is one special trick here too. Common template macro can optionally
126 * make our life a bit easier by doing R, G, B, A color components
127 * deinterleaving for 32bpp pixel formats (and this feature is used in
128 * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
129 * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
130 * actually use d0 register for blue channel (a vector of eight 8-bit
131 * values), d1 register for green, d2 for red and d3 for alpha. This
132 * simple conversion can be also done with a few NEON instructions:
134 * Packed to planar conversion:
140 * Planar to packed conversion:
146 * But pixel can be loaded directly in planar format using VLD4.8 NEON
147 * instruction. It is 1 cycle slower than VLD1.32, so this is not always
148 * desirable, that's why deinterleaving is optional.
150 * But anyway, here is the code:
152 .macro pixman_composite_over_8888_0565_process_pixblock_head
/* Head: first half of the per-block arithmetic for a8r8g8b8 OVER r5g6b5.
   Inputs per the comments above: planar src in d0-d3, packed dst in d4/d5. */
153 /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
154 and put data into d6 - red, d7 - green, d30 - blue */
159 vmvn.8 d3, d3 /* invert source alpha */
161 vshrn.u16 d30, q2, #2 /* narrowing shift: blue channel into d30 */
162 /* now do alpha blending, storing results in 8-bit planar format
163 into d16 - red, d19 - green, d18 - blue */
166 vmull.u8 q12, d3, d30 /* widening multiply: (255 - srcA) * dst blue */
/* Each vrshr #8 + vraddhn pair below approximates a rounded divide by 255,
   narrowing the 16-bit products back to 8 bits. */
167 vrshr.u16 q13, q10, #8
168 vrshr.u16 q3, q11, #8
169 vrshr.u16 q15, q12, #8
170 vraddhn.u16 d20, q10, q13
171 vraddhn.u16 d23, q11, q3
172 vraddhn.u16 d22, q12, q15
175 .macro pixman_composite_over_8888_0565_process_pixblock_tail
/* Tail: second half — add source contribution and repack to r5g6b5. */
176 /* ... continue alpha blending */
177 vqadd.u8 d16, d2, d20 /* saturating add of source channel */
179 /* convert the result to r5g6b5 and store it into {d28, d29} */
180 vshll.u8 q14, d16, #8
184 vsri.u16 q14, q9, #11 /* shift-right-insert packs fields into 5:6:5 */
188 * OK, now we got almost everything that we need. Using the above two
189 * macros, the work can be done right. But now we want to optimize
190 * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
191 * a lot from good code scheduling and software pipelining.
193 * Let's construct some code, which will run in the core main loop.
194 * Some pseudo-code of the main loop will look like this:
202 * It may look a bit weird, but this setup allows to hide instruction
203 * latencies better and also utilize dual-issue capability more
204 * efficiently (make pairs of load-store and ALU instructions).
206 * So what we need now is a '*_tail_head' macro, which will be used
207 * in the core main loop. A trivial straightforward implementation
208 * of this macro would look like this:
210 * pixman_composite_over_8888_0565_process_pixblock_tail
211 * vst1.16 {d28, d29}, [DST_W, :128]!
212 * vld1.16 {d4, d5}, [DST_R, :128]!
213 * vld4.32 {d0, d1, d2, d3}, [SRC]!
214 * pixman_composite_over_8888_0565_process_pixblock_head
217 * Now it also got some VLD/VST instructions. We simply can't move from
218 * processing one block of pixels to the other one with just arithmetics.
219 * The previously processed data needs to be written to memory and new
220 * data needs to be fetched. Fortunately, this main loop does not deal
221 * with partial leading/trailing pixels and can load/store a full block
222 * of pixels in a bulk. Additionally, destination buffer is already
223 * 16 bytes aligned here (which is good for performance).
225 * New things here are DST_R, DST_W, SRC and MASK identifiers. These
226 * are the aliases for ARM registers which are used as pointers for
227 * accessing data. We maintain separate pointers for reading and writing
228 * destination buffer (DST_R and DST_W).
230 * Another new thing is 'cache_preload' macro. It is used for prefetching
231 * data into CPU L2 cache and improve performance when dealing with large
232 * images which are far larger than cache size. It uses one argument
233 * (actually two, but they need to be the same here) - number of pixels
234 * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
235 * details about this macro. Moreover, if good performance is needed
236 * the code from this macro needs to be copied into '*_tail_head' macro
237 * and mixed with the rest of code for optimal instructions scheduling.
238 * We are actually doing it below.
240 * Now after all the explanations, here is the optimized code.
241 * Different instruction streams (originating from '*_head', '*_tail'
242 * and 'cache_preload' macro) use different indentation levels for
243 * better readability. Actually taking the code from one of these
244 * indentation levels and ignoring a few VLD/VST instructions would
245 * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
251 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head
/* Optimized software-pipelined main-loop body: tail of the previous block,
   loads/stores for the next block, head of the next block, and the PF*
   advanced-prefetch bookkeeping, all interleaved for Cortex-A8 dual issue.
   Indentation levels distinguish the original instruction streams. */
252 vqadd.u8 d16, d2, d20
253 vld1.16 {d4, d5}, [DST_R, :128]! /* load next dst block (16-byte aligned) */
256 vld4.8 {d0, d1, d2, d3}, [SRC]! /* deinterleaving load of next src block */
259 vshll.u8 q14, d16, #8
260 PF add PF_X, PF_X, #8 /* advance prefetch position by one block */
264 PF addne PF_X, PF_X, #8
266 PF subne PF_CTL, PF_CTL, #1
268 vshrn.u16 d30, q2, #2
270 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
272 vmull.u8 q12, d3, d30
273 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
277 vrshr.u16 q13, q10, #8
278 PF subge PF_X, PF_X, ORIG_W /* wrap prefetch to next scanline */
279 vrshr.u16 q3, q11, #8
280 vrshr.u16 q15, q12, #8
281 PF subges PF_CTL, PF_CTL, #0x10
282 vsri.u16 q14, q9, #11
283 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
284 vraddhn.u16 d20, q10, q13
285 vraddhn.u16 d23, q11, q3
286 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
287 vraddhn.u16 d22, q12, q15
288 vst1.16 {d28, d29}, [DST_W, :128]! /* store previous block's result */
293 /* If we did not care much about the performance, we would just use this... */
/* Reference (non-optimized) version of the main-loop body, kept only for
 * illustration.  Two fixes versus the listing above:
 *  - renamed with a '_simple' suffix: GAS rejects redefining a macro name,
 *    and the optimized '..._tail_head' macro is already defined above;
 *  - vld4.32 -> vld4.8: the template is instantiated with
 *    FLAG_DEINTERLEAVE_32BPP, so the source must be loaded with a per-byte
 *    deinterleaving vld4.8 (b/g/r/a planes into d0-d3), matching the load
 *    used by the optimized version.
 */
294 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head_simple
295 pixman_composite_over_8888_0565_process_pixblock_tail
296 vst1.16 {d28, d29}, [DST_W, :128]!
297 vld1.16 {d4, d5}, [DST_R, :128]!
298 vld4.8 {d0, d1, d2, d3}, [SRC]!
299 pixman_composite_over_8888_0565_process_pixblock_head
306 * And now the final part. We are using 'generate_composite_function' macro
307 * to put all the stuff together. We are specifying the name of the function
308 * which we want to get, number of bits per pixel for the source, mask and
309 * destination (0 if unused, like mask in this case). Next come some bit
311 * FLAG_DST_READWRITE - tells that the destination buffer is both read
312 * and written, for write-only buffer we would use
313 * FLAG_DST_WRITEONLY flag instead
314 * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
315 * and separate color channels for 32bpp format.
316 * The next things are:
317 * - the number of pixels processed per iteration (8 in this case, because
318 * that's the maximum what can fit into four 64-bit NEON registers).
319 * - prefetch distance, measured in pixel blocks. In this case it is 5 times
320 * by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
321 * prefetch distance can be selected by running some benchmarks.
323 * After that we specify some macros, these are 'default_init',
324 * 'default_cleanup' here which are empty (but it is possible to have custom
325 * init/cleanup macros to be able to save/restore some extra NEON registers
326 * like d8-d15 or do anything else) followed by
327 * 'pixman_composite_over_8888_0565_process_pixblock_head',
328 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
329 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
330 * which we got implemented above.
332 * The last part is the NEON registers allocation scheme.
/* Instantiate the a8r8g8b8 OVER r5g6b5 fast path from the template macro,
   wiring in the head/tail/tail_head pieces defined above. */
334 generate_composite_function \
335 pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
336 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
337 8, /* number of pixels, processed in a single block */ \
338 5, /* prefetch distance */ \
341 pixman_composite_over_8888_0565_process_pixblock_head, \
342 pixman_composite_over_8888_0565_process_pixblock_tail, \
343 pixman_composite_over_8888_0565_process_pixblock_tail_head, \
344 28, /* dst_w_basereg */ \
345 4, /* dst_r_basereg */ \
346 0, /* src_basereg */ \
347 24 /* mask_basereg */
349 /******************************************************************************/
351 .macro pixman_composite_over_n_0565_process_pixblock_head
/* Solid-color OVER r5g6b5: same arithmetic as over_8888_0565, but the
   source (and its inverted alpha in d3) is constant, set up in *_init. */
352 /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
353 and put data into d6 - red, d7 - green, d30 - blue */
359 vshrn.u16 d30, q2, #2
360 /* now do alpha blending, storing results in 8-bit planar format
361 into d16 - red, d19 - green, d18 - blue */
364 vmull.u8 q12, d3, d30
/* vrshr #8 + vraddhn pairs: rounded divide by 255, narrowed to 8 bits. */
365 vrshr.u16 q13, q10, #8
366 vrshr.u16 q3, q11, #8
367 vrshr.u16 q15, q12, #8
368 vraddhn.u16 d20, q10, q13
369 vraddhn.u16 d23, q11, q3
370 vraddhn.u16 d22, q12, q15
373 .macro pixman_composite_over_n_0565_process_pixblock_tail
374 /* ... continue alpha blending */
375 vqadd.u8 d16, d2, d20
377 /* convert the result to r5g6b5 and store it into {d28, d29} */
378 vshll.u8 q14, d16, #8
382 vsri.u16 q14, q9, #11
385 /* TODO: expand macros and do better instructions scheduling */
386 .macro pixman_composite_over_n_0565_process_pixblock_tail_head
/* Straightforward (unscheduled) main-loop body: tail, dst load/store, head. */
387 pixman_composite_over_n_0565_process_pixblock_tail
388 vld1.16 {d4, d5}, [DST_R, :128]!
389 vst1.16 {d28, d29}, [DST_W, :128]!
390 pixman_composite_over_n_0565_process_pixblock_head
393 .macro pixman_composite_over_n_0565_init
/* Fetch the solid source color from the stack and pre-invert its alpha. */
394 add DUMMY, sp, #ARGS_STACK_OFFSET
395 vld1.32 {d3[0]}, [DUMMY]
400 vmvn.8 d3, d3 /* invert source alpha */
/* Instantiate the solid-color OVER r5g6b5 fast path (src bpp 0 = solid). */
403 generate_composite_function \
404 pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
405 FLAG_DST_READWRITE, \
406 8, /* number of pixels, processed in a single block */ \
407 5, /* prefetch distance */ \
408 pixman_composite_over_n_0565_init, \
410 pixman_composite_over_n_0565_process_pixblock_head, \
411 pixman_composite_over_n_0565_process_pixblock_tail, \
412 pixman_composite_over_n_0565_process_pixblock_tail_head, \
413 28, /* dst_w_basereg */ \
414 4, /* dst_r_basereg */ \
415 0, /* src_basereg */ \
416 24 /* mask_basereg */
418 /******************************************************************************/
420 .macro pixman_composite_src_8888_0565_process_pixblock_head
/* SRC a8r8g8b8 -> r5g6b5 conversion: no blending, dst is write-only. */
426 .macro pixman_composite_src_8888_0565_process_pixblock_tail
428 vsri.u16 q14, q9, #11 /* shift-right-insert packs fields into 5:6:5 */
431 .macro pixman_composite_src_8888_0565_process_pixblock_tail_head
/* Pipelined loop body with interleaved PF* prefetch bookkeeping. */
433 PF add PF_X, PF_X, #8
435 vld4.8 {d0, d1, d2, d3}, [SRC]! /* deinterleaving load of next src block */
436 PF addne PF_X, PF_X, #8
437 PF subne PF_CTL, PF_CTL, #1
438 vsri.u16 q14, q9, #11
440 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
442 vst1.16 {d28, d29}, [DST_W, :128]! /* store previous block's result */
443 PF subge PF_X, PF_X, ORIG_W /* wrap prefetch to next scanline */
444 PF subges PF_CTL, PF_CTL, #0x10
446 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
/* Instantiate the a8r8g8b8 -> r5g6b5 SRC (copy/convert) fast path. */
450 generate_composite_function \
451 pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
452 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
453 8, /* number of pixels, processed in a single block */ \
454 10, /* prefetch distance */ \
457 pixman_composite_src_8888_0565_process_pixblock_head, \
458 pixman_composite_src_8888_0565_process_pixblock_tail, \
459 pixman_composite_src_8888_0565_process_pixblock_tail_head
461 /******************************************************************************/
463 .macro pixman_composite_src_0565_8888_process_pixblock_head
/* SRC r5g6b5 -> a8r8g8b8: expand the 5/6/5 fields into 8-bit planes
   (d28..d31) via narrowing shifts. */
464 vshrn.u16 d30, q0, #8
465 vshrn.u16 d29, q0, #3
470 vshrn.u16 d28, q0, #2
473 .macro pixman_composite_src_0565_8888_process_pixblock_tail
476 /* TODO: expand macros and do better instructions scheduling */
477 .macro pixman_composite_src_0565_8888_process_pixblock_tail_head
/* Straightforward (unscheduled) main-loop body. */
478 pixman_composite_src_0565_8888_process_pixblock_tail
479 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! /* interleaving store */
480 vld1.16 {d0, d1}, [SRC]!
481 pixman_composite_src_0565_8888_process_pixblock_head
/* Instantiate the r5g6b5 -> a8r8g8b8 SRC (copy/convert) fast path. */
485 generate_composite_function \
486 pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
487 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
488 8, /* number of pixels, processed in a single block */ \
489 10, /* prefetch distance */ \
492 pixman_composite_src_0565_8888_process_pixblock_head, \
493 pixman_composite_src_0565_8888_process_pixblock_tail, \
494 pixman_composite_src_0565_8888_process_pixblock_tail_head
496 /******************************************************************************/
498 .macro pixman_composite_add_8000_8000_process_pixblock_head
/* ADD for a8 buffers: saturating add of src and dst (32 pixels/block). */
503 .macro pixman_composite_add_8000_8000_process_pixblock_tail
506 .macro pixman_composite_add_8000_8000_process_pixblock_tail_head
/* Pipelined loop body; note the 32-pixel block => PF_X advances by 32. */
507 vld1.8 {d0, d1, d2, d3}, [SRC]!
508 PF add PF_X, PF_X, #32
510 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
511 PF addne PF_X, PF_X, #32
512 PF subne PF_CTL, PF_CTL, #1
513 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! /* store previous result */
515 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
516 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
517 PF subge PF_X, PF_X, ORIG_W /* wrap prefetch to next scanline */
518 PF subges PF_CTL, PF_CTL, #0x10
520 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
521 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
/* Instantiate the a8 + a8 ADD fast path (32 pixels per block). */
525 generate_composite_function \
526 pixman_composite_add_8000_8000_asm_neon, 8, 0, 8, \
527 FLAG_DST_READWRITE, \
528 32, /* number of pixels, processed in a single block */ \
529 10, /* prefetch distance */ \
532 pixman_composite_add_8000_8000_process_pixblock_head, \
533 pixman_composite_add_8000_8000_process_pixblock_tail, \
534 pixman_composite_add_8000_8000_process_pixblock_tail_head
536 /******************************************************************************/
538 .macro pixman_composite_add_8888_8888_process_pixblock_tail_head
/* Same ADD loop body as add_8000_8000, but for 32bpp: 8 pixels/block,
   so PF_X advances by 8.  Head/tail are reused from add_8000_8000. */
539 vld1.8 {d0, d1, d2, d3}, [SRC]!
540 PF add PF_X, PF_X, #8
542 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
543 PF addne PF_X, PF_X, #8
544 PF subne PF_CTL, PF_CTL, #1
545 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! /* store previous result */
547 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
548 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
549 PF subge PF_X, PF_X, ORIG_W /* wrap prefetch to next scanline */
550 PF subges PF_CTL, PF_CTL, #0x10
552 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
553 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
/* Instantiate the 32bpp ADD fast path, reusing the 8bpp head/tail macros
   (the saturating add is format-agnostic at the byte level). */
557 generate_composite_function \
558 pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
559 FLAG_DST_READWRITE, \
560 8, /* number of pixels, processed in a single block */ \
561 10, /* prefetch distance */ \
564 pixman_composite_add_8000_8000_process_pixblock_head, \
565 pixman_composite_add_8000_8000_process_pixblock_tail, \
566 pixman_composite_add_8888_8888_process_pixblock_tail_head
/* Single-scanline variant of the same ADD operation (no prefetch distance
   argument for the single-scanline template). */
568 generate_composite_function_single_scanline \
569 pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
570 FLAG_DST_READWRITE, \
571 8, /* number of pixels, processed in a single block */ \
574 pixman_composite_add_8000_8000_process_pixblock_head, \
575 pixman_composite_add_8000_8000_process_pixblock_tail, \
576 pixman_composite_add_8888_8888_process_pixblock_tail_head
578 /******************************************************************************/
580 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
/* OUT_REVERSE 8888_8888: dst * (255 - srcA), computed per channel.
   Source planes in d0-d3 (alpha in d3), dst planes in d4-d7. */
581 vmvn.8 d24, d3 /* get inverted alpha */
582 /* do alpha blending */
585 vmull.u8 q10, d24, d6
586 vmull.u8 q11, d24, d7
589 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
/* vrshr #8 + vraddhn pairs: rounded divide by 255, results in d28-d31. */
590 vrshr.u16 q14, q8, #8
591 vrshr.u16 q15, q9, #8
592 vrshr.u16 q12, q10, #8
593 vrshr.u16 q13, q11, #8
594 vraddhn.u16 d28, q14, q8
595 vraddhn.u16 d29, q15, q9
596 vraddhn.u16 d30, q12, q10
597 vraddhn.u16 d31, q13, q11
600 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
/* Pipelined loop body: tail of previous block interleaved with loads,
   store, head of next block, and PF* prefetch bookkeeping. */
601 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
602 vrshr.u16 q14, q8, #8
603 PF add PF_X, PF_X, #8
605 vrshr.u16 q15, q9, #8
606 vrshr.u16 q12, q10, #8
607 vrshr.u16 q13, q11, #8
608 PF addne PF_X, PF_X, #8
609 PF subne PF_CTL, PF_CTL, #1
610 vraddhn.u16 d28, q14, q8
611 vraddhn.u16 d29, q15, q9
613 vraddhn.u16 d30, q12, q10
614 vraddhn.u16 d31, q13, q11
615 vld4.8 {d0, d1, d2, d3}, [SRC]!
616 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
618 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
619 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
620 PF subge PF_X, PF_X, ORIG_W
622 PF subges PF_CTL, PF_CTL, #0x10
624 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
625 vmull.u8 q10, d22, d6
626 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
627 vmull.u8 q11, d22, d7
/* Instantiate the single-scanline OUT_REVERSE fast path. */
630 generate_composite_function_single_scanline \
631 pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
632 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
633 8, /* number of pixels, processed in a single block */ \
636 pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
637 pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
638 pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
640 /******************************************************************************/
642 .macro pixman_composite_over_8888_8888_process_pixblock_head
/* OVER = OUT_REVERSE (dst * (255 - srcA)) plus a saturating add of src. */
643 pixman_composite_out_reverse_8888_8888_process_pixblock_head
646 .macro pixman_composite_over_8888_8888_process_pixblock_tail
647 pixman_composite_out_reverse_8888_8888_process_pixblock_tail
648 vqadd.u8 q14, q0, q14 /* add source channels, saturating */
649 vqadd.u8 q15, q1, q15
652 .macro pixman_composite_over_8888_8888_process_pixblock_tail_head
/* Pipelined loop body: out_reverse tail + vqadd interleaved with loads,
   store, head of next block, and PF* prefetch bookkeeping. */
653 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
654 vrshr.u16 q14, q8, #8
655 PF add PF_X, PF_X, #8
657 vrshr.u16 q15, q9, #8
658 vrshr.u16 q12, q10, #8
659 vrshr.u16 q13, q11, #8
660 PF addne PF_X, PF_X, #8
661 PF subne PF_CTL, PF_CTL, #1
662 vraddhn.u16 d28, q14, q8
663 vraddhn.u16 d29, q15, q9
665 vraddhn.u16 d30, q12, q10
666 vraddhn.u16 d31, q13, q11
667 vqadd.u8 q14, q0, q14
668 vqadd.u8 q15, q1, q15
669 vld4.8 {d0, d1, d2, d3}, [SRC]!
670 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
672 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
673 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
674 PF subge PF_X, PF_X, ORIG_W
676 PF subges PF_CTL, PF_CTL, #0x10
678 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
679 vmull.u8 q10, d22, d6
680 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
681 vmull.u8 q11, d22, d7
/* Instantiate the a8r8g8b8 OVER a8r8g8b8 fast path. */
684 generate_composite_function \
685 pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
686 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
687 8, /* number of pixels, processed in a single block */ \
688 5, /* prefetch distance */ \
691 pixman_composite_over_8888_8888_process_pixblock_head, \
692 pixman_composite_over_8888_8888_process_pixblock_tail, \
693 pixman_composite_over_8888_8888_process_pixblock_tail_head
/* Single-scanline variant of the same OVER operation. */
695 generate_composite_function_single_scanline \
696 pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
697 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
698 8, /* number of pixels, processed in a single block */ \
701 pixman_composite_over_8888_8888_process_pixblock_head, \
702 pixman_composite_over_8888_8888_process_pixblock_tail, \
703 pixman_composite_over_8888_8888_process_pixblock_tail_head
705 /******************************************************************************/
707 /* TODO: expand macros and do better instructions scheduling */
708 .macro pixman_composite_over_n_8888_process_pixblock_tail_head
/* Solid-color OVER 8888: reuses over_8888_8888 head/tail (no src reload —
   the constant source stays in registers, set up by *_init below). */
709 pixman_composite_over_8888_8888_process_pixblock_tail
710 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
711 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
712 pixman_composite_over_8888_8888_process_pixblock_head
715 .macro pixman_composite_over_n_8888_init
/* Fetch the solid source color from the stack into d3 (lane 0). */
716 add DUMMY, sp, #ARGS_STACK_OFFSET
717 vld1.32 {d3[0]}, [DUMMY]
/* Instantiate the solid-color OVER a8r8g8b8 fast path. */
724 generate_composite_function \
725 pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
726 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
727 8, /* number of pixels, processed in a single block */ \
728 5, /* prefetch distance */ \
729 pixman_composite_over_n_8888_init, \
731 pixman_composite_over_8888_8888_process_pixblock_head, \
732 pixman_composite_over_8888_8888_process_pixblock_tail, \
733 pixman_composite_over_n_8888_process_pixblock_tail_head
735 /******************************************************************************/
737 .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
/* OVER_REVERSE with solid source: dst is both the "source" operand
   (loaded via DST_R into d0-d3) and the blend target; the constant color
   sits in d4-d7 (see *_init and the swapped basereg arguments below).
   Only dst prefetch is needed — there is no src buffer. */
738 vrshr.u16 q14, q8, #8
739 PF add PF_X, PF_X, #8
741 vrshr.u16 q15, q9, #8
742 vrshr.u16 q12, q10, #8
743 vrshr.u16 q13, q11, #8
744 PF addne PF_X, PF_X, #8
745 PF subne PF_CTL, PF_CTL, #1
746 vraddhn.u16 d28, q14, q8
747 vraddhn.u16 d29, q15, q9
749 vraddhn.u16 d30, q12, q10
750 vraddhn.u16 d31, q13, q11
751 vqadd.u8 q14, q0, q14
752 vqadd.u8 q15, q1, q15
753 vld4.8 {d0, d1, d2, d3}, [DST_R, :128]!
755 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
756 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
757 PF subge PF_X, PF_X, ORIG_W
759 PF subges PF_CTL, PF_CTL, #0x10
761 vmull.u8 q10, d22, d6
762 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
763 vmull.u8 q11, d22, d7
766 .macro pixman_composite_over_reverse_n_8888_init
/* Fetch the solid color from the stack into d7 (lane 0). */
767 add DUMMY, sp, #ARGS_STACK_OFFSET
768 vld1.32 {d7[0]}, [DUMMY]
/* Instantiate solid-color OVER_REVERSE; note dst_r_basereg=0 and
   src_basereg=4 — swapped relative to the other functions, matching the
   register roles described in the tail_head macro above. */
775 generate_composite_function \
776 pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
777 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
778 8, /* number of pixels, processed in a single block */ \
779 5, /* prefetch distance */ \
780 pixman_composite_over_reverse_n_8888_init, \
782 pixman_composite_over_8888_8888_process_pixblock_head, \
783 pixman_composite_over_8888_8888_process_pixblock_tail, \
784 pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
785 28, /* dst_w_basereg */ \
786 0, /* dst_r_basereg */ \
787 4, /* src_basereg */ \
788 24 /* mask_basereg */
790 /******************************************************************************/
792 .macro pixman_composite_over_n_8_0565_process_pixblock_head
/* Solid color with a8 mask OVER r5g6b5.  First multiply the solid color
   components (d8-d11, set up in *_init) by the mask (d24), then blend the
   result over the r5g6b5 destination as in over_8888_0565. */
796 vmull.u8 q6, d24, d10
797 vmull.u8 q7, d24, d11
/* vrshr #8 + vraddhn pairs: rounded divide by 255 of color*mask products,
   leaving the masked source in d0-d3. */
798 vrshr.u16 q10, q0, #8
799 vrshr.u16 q11, q1, #8
800 vrshr.u16 q12, q6, #8
801 vrshr.u16 q13, q7, #8
802 vraddhn.u16 d0, q0, q10
803 vraddhn.u16 d1, q1, q11
804 vraddhn.u16 d2, q6, q12
805 vraddhn.u16 d3, q7, q13
813 vshrn.u16 d30, q2, #2
814 /* now do alpha blending */
817 vmull.u8 q12, d3, d30
818 vrshr.u16 q13, q10, #8
819 vrshr.u16 q3, q11, #8
820 vrshr.u16 q15, q12, #8
821 vraddhn.u16 d20, q10, q13
822 vraddhn.u16 d23, q11, q3
823 vraddhn.u16 d22, q12, q15
826 .macro pixman_composite_over_n_8_0565_process_pixblock_tail
/* Tail: add the masked source contribution and repack to r5g6b5. */
827 vqadd.u8 d16, d2, d20
829 /* convert to r5g6b5 */
830 vshll.u8 q14, d16, #8
834 vsri.u16 q14, q9, #11 /* shift-right-insert packs fields into 5:6:5 */
837 /* TODO: expand macros and do better instructions scheduling */
838 .macro pixman_composite_over_n_8_0565_process_pixblock_tail_head
/* Straightforward (unscheduled) loop body: tail, store, dst + mask loads,
   head.  The solid source color stays resident in d8-d11. */
839 pixman_composite_over_n_8_0565_process_pixblock_tail
840 vst1.16 {d28, d29}, [DST_W, :128]!
841 vld1.16 {d4, d5}, [DST_R, :128]!
842 vld1.8 {d24}, [MASK]!
844 pixman_composite_over_n_8_0565_process_pixblock_head
848 * This function needs a special initialization of solid mask.
849 * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
850 * offset, split into color components and replicated in d8-d11
851 * registers. Additionally, this function needs all the NEON registers,
852 * so it has to save d8-d15 registers which are callee saved according
853 * to ABI. These registers are restored from 'cleanup' macro. All the
854 * other NEON registers are caller saved, so can be clobbered freely
855 * without introducing any problems.
857 .macro pixman_composite_over_n_8_0565_init
858 add DUMMY, sp, #ARGS_STACK_OFFSET
860 vld1.32 {d11[0]}, [DUMMY] /* solid color into d11 lane 0 */
867 .macro pixman_composite_over_n_8_0565_cleanup
/* Cleanup: restores the callee-saved d8-d15 registers (see comment above). */
/* Instantiate solid color + a8 mask OVER r5g6b5 (mask bpp = 8). */
871 generate_composite_function \
872 pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
873 FLAG_DST_READWRITE, \
874 8, /* number of pixels, processed in a single block */ \
875 5, /* prefetch distance */ \
876 pixman_composite_over_n_8_0565_init, \
877 pixman_composite_over_n_8_0565_cleanup, \
878 pixman_composite_over_n_8_0565_process_pixblock_head, \
879 pixman_composite_over_n_8_0565_process_pixblock_tail, \
880 pixman_composite_over_n_8_0565_process_pixblock_tail_head
882 /******************************************************************************/
884 /* TODO: expand macros and do better instructions scheduling */
885 .macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
/* a8r8g8b8 source + a8 mask OVER r5g6b5: same per-pixel arithmetic as
   over_n_8_0565, but the source is reloaded per block into d8-d11. */
886 vld1.16 {d4, d5}, [DST_R, :128]!
887 pixman_composite_over_n_8_0565_process_pixblock_tail
888 vld4.8 {d8, d9, d10, d11}, [SRC]! /* deinterleaving src load */
890 vld1.8 {d24}, [MASK]!
891 pixman_composite_over_n_8_0565_process_pixblock_head
892 vst1.16 {d28, d29}, [DST_W, :128]!
895 .macro pixman_composite_over_8888_8_0565_init
/* Init: prepares registers (d8-d15 are callee-saved and must be preserved). */
899 .macro pixman_composite_over_8888_8_0565_cleanup
/* Instantiate a8r8g8b8 + a8 mask OVER r5g6b5; src_basereg=8 matches the
   d8-d11 source registers used by the shared head/tail macros. */
903 generate_composite_function \
904 pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
905 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
906 8, /* number of pixels, processed in a single block */ \
907 5, /* prefetch distance */ \
908 pixman_composite_over_8888_8_0565_init, \
909 pixman_composite_over_8888_8_0565_cleanup, \
910 pixman_composite_over_n_8_0565_process_pixblock_head, \
911 pixman_composite_over_n_8_0565_process_pixblock_tail, \
912 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
913 28, /* dst_w_basereg */ \
914 4, /* dst_r_basereg */ \
915 8, /* src_basereg */ \
916 24 /* mask_basereg */
918 /******************************************************************************/
920 .macro pixman_composite_src_0565_0565_process_pixblock_head
/* Plain r5g6b5 -> r5g6b5 copy: no per-pixel arithmetic needed. */
923 .macro pixman_composite_src_0565_0565_process_pixblock_tail
926 .macro pixman_composite_src_0565_0565_process_pixblock_tail_head
/* Loop body is just store-previous / load-next (16 pixels per block). */
927 vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
928 vld1.16 {d0, d1, d2, d3}, [SRC]!
/* Instantiate the r5g6b5 copy fast path; pixel data lives in d0-d3, hence
   basereg 0 for both the source and the write-out registers. */
932 generate_composite_function \
933 pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
934 FLAG_DST_WRITEONLY, \
935 16, /* number of pixels, processed in a single block */ \
936 10, /* prefetch distance */ \
939 pixman_composite_src_0565_0565_process_pixblock_head, \
940 pixman_composite_src_0565_0565_process_pixblock_tail, \
941 pixman_composite_src_0565_0565_process_pixblock_tail_head, \
942 0, /* dst_w_basereg */ \
943 0, /* dst_r_basereg */ \
944 0, /* src_basereg */ \
947 /******************************************************************************/
949 .macro pixman_composite_src_n_8_process_pixblock_head
/* Solid fill of an a8 buffer: the replicated value is prepared once in
   *_init, so the loop body only stores. */
952 .macro pixman_composite_src_n_8_process_pixblock_tail
955 .macro pixman_composite_src_n_8_process_pixblock_tail_head
956 vst1.8 {d0, d1, d2, d3}, [DST_W, :128]! /* 32 pixels per store */
959 .macro pixman_composite_src_n_8_init
/* Load the solid value from the stack into d0 (then replicated — the
   replication instructions are in the elided portion of this macro). */
960 add DUMMY, sp, #ARGS_STACK_OFFSET
961 vld1.32 {d0[0]}, [DUMMY]
969 .macro pixman_composite_src_n_8_cleanup
/* Instantiate the solid a8 fill; prefetch distance 0 disables prefetch
   (nothing is read from memory in the loop). */
972 generate_composite_function \
973 pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
974 FLAG_DST_WRITEONLY, \
975 32, /* number of pixels, processed in a single block */ \
976 0, /* prefetch distance */ \
977 pixman_composite_src_n_8_init, \
978 pixman_composite_src_n_8_cleanup, \
979 pixman_composite_src_n_8_process_pixblock_head, \
980 pixman_composite_src_n_8_process_pixblock_tail, \
981 pixman_composite_src_n_8_process_pixblock_tail_head, \
982 0, /* dst_w_basereg */ \
983 0, /* dst_r_basereg */ \
984 0, /* src_basereg */ \
987 /******************************************************************************/
/*
 * SRC operator, solid color -> r5g6b5: constant fill of a 16bpp surface.
 * init loads the (already 0565-formatted) color into the low 16 bits of
 * d0 and replicates it across the whole 64-bit register with two vsli
 * shift-and-insert steps (16 -> 32 -> 64 bits); upstream then copies
 * d0 into d1-d3 (copy lines missing from this extract -- TODO confirm).
 */
989 .macro pixman_composite_src_n_0565_process_pixblock_head
992 .macro pixman_composite_src_n_0565_process_pixblock_tail
995 .macro pixman_composite_src_n_0565_process_pixblock_tail_head
/* d0-d3 hold the replicated fill value; store 16 pixels per iteration */
996 vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!

999 .macro pixman_composite_src_n_0565_init
1000 add DUMMY, sp, #ARGS_STACK_OFFSET
1001 vld1.32 {d0[0]}, [DUMMY]
/* replicate the 16-bit color into all four 16-bit lanes of d0 */
1002 vsli.u64 d0, d0, #16
1003 vsli.u64 d0, d0, #32

1008 .macro pixman_composite_src_n_0565_cleanup

/* src_bpp=0 (solid), mask_bpp=0, dst_bpp=16; prefetch disabled (fill) */
1011 generate_composite_function \
1012 pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
1013 FLAG_DST_WRITEONLY, \
1014 16, /* number of pixels, processed in a single block */ \
1015 0, /* prefetch distance */ \
1016 pixman_composite_src_n_0565_init, \
1017 pixman_composite_src_n_0565_cleanup, \
1018 pixman_composite_src_n_0565_process_pixblock_head, \
1019 pixman_composite_src_n_0565_process_pixblock_tail, \
1020 pixman_composite_src_n_0565_process_pixblock_tail_head, \
1021 0, /* dst_w_basereg */ \
1022 0, /* dst_r_basereg */ \
1023 0, /* src_basereg */ \
1024 0 /* mask_basereg */
1026 /******************************************************************************/
/*
 * SRC operator, solid color -> a8r8g8b8: constant fill of a 32bpp
 * surface.  init loads the color into d0[0] and duplicates it into the
 * high 32 bits with vsli; upstream copies d0 to d1-d3 (not visible in
 * this extract -- TODO confirm).
 */
1028 .macro pixman_composite_src_n_8888_process_pixblock_head
1031 .macro pixman_composite_src_n_8888_process_pixblock_tail
1034 .macro pixman_composite_src_n_8888_process_pixblock_tail_head
/* d0-d3 hold the replicated fill value; store 8 pixels per iteration */
1035 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!

1038 .macro pixman_composite_src_n_8888_init
1039 add DUMMY, sp, #ARGS_STACK_OFFSET
1040 vld1.32 {d0[0]}, [DUMMY]
/* duplicate the 32-bit color into both lanes of d0 */
1041 vsli.u64 d0, d0, #32

1046 .macro pixman_composite_src_n_8888_cleanup

/* src_bpp=0 (solid), mask_bpp=0, dst_bpp=32; prefetch disabled (fill) */
1049 generate_composite_function \
1050 pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
1051 FLAG_DST_WRITEONLY, \
1052 8, /* number of pixels, processed in a single block */ \
1053 0, /* prefetch distance */ \
1054 pixman_composite_src_n_8888_init, \
1055 pixman_composite_src_n_8888_cleanup, \
1056 pixman_composite_src_n_8888_process_pixblock_head, \
1057 pixman_composite_src_n_8888_process_pixblock_tail, \
1058 pixman_composite_src_n_8888_process_pixblock_tail_head, \
1059 0, /* dst_w_basereg */ \
1060 0, /* dst_r_basereg */ \
1061 0, /* src_basereg */ \
1062 0 /* mask_basereg */
1064 /******************************************************************************/
/*
 * SRC operator, 32bpp -> 32bpp: plain copy.  Identical structure to the
 * 0565 copy above, but moves 8 ARGB pixels (32 bytes) per iteration.
 * NOTE(review): cache_preload and .endm lines from upstream are missing
 * in this extract.
 */
1066 .macro pixman_composite_src_8888_8888_process_pixblock_head
1069 .macro pixman_composite_src_8888_8888_process_pixblock_tail
1072 .macro pixman_composite_src_8888_8888_process_pixblock_tail_head
/* store previous block, load next block */
1073 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1074 vld1.32 {d0, d1, d2, d3}, [SRC]!

/* src_bpp=32, mask_bpp=0, dst_bpp=32; destination is write-only */
1078 generate_composite_function \
1079 pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
1080 FLAG_DST_WRITEONLY, \
1081 8, /* number of pixels, processed in a single block */ \
1082 10, /* prefetch distance */ \
1085 pixman_composite_src_8888_8888_process_pixblock_head, \
1086 pixman_composite_src_8888_8888_process_pixblock_tail, \
1087 pixman_composite_src_8888_8888_process_pixblock_tail_head, \
1088 0, /* dst_w_basereg */ \
1089 0, /* dst_r_basereg */ \
1090 0, /* src_basereg */ \
1091 0 /* mask_basereg */
1093 /******************************************************************************/
/*
 * SRC operator, x8r8g8b8 -> a8r8g8b8: copy while forcing the (undefined)
 * alpha byte to a known value.  init shifts a constant left by 24 bits in
 * q2, presumably building a 0xFF000000 per-pixel alpha mask; the constant
 * load preceding the shift and the head's OR-with-mask lines are missing
 * from this extract -- TODO confirm against upstream.
 */
1095 .macro pixman_composite_src_x888_8888_process_pixblock_head
1100 .macro pixman_composite_src_x888_8888_process_pixblock_tail
1103 .macro pixman_composite_src_x888_8888_process_pixblock_tail_head
/* store previous block, load next block */
1104 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1105 vld1.32 {d0, d1, d2, d3}, [SRC]!

1111 .macro pixman_composite_src_x888_8888_init
/* move the constant into the top byte of each 32-bit lane of q2 */
1113 vshl.u32 q2, q2, #24

1116 generate_composite_function \
1117 pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
1118 FLAG_DST_WRITEONLY, \
1119 8, /* number of pixels, processed in a single block */ \
1120 10, /* prefetch distance */ \
1121 pixman_composite_src_x888_8888_init, \
1123 pixman_composite_src_x888_8888_process_pixblock_head, \
1124 pixman_composite_src_x888_8888_process_pixblock_tail, \
1125 pixman_composite_src_x888_8888_process_pixblock_tail_head, \
1126 0, /* dst_w_basereg */ \
1127 0, /* dst_r_basereg */ \
1128 0, /* src_basereg */ \
1129 0 /* mask_basereg */
1131 /******************************************************************************/
/*
 * OVER operator, solid color with an a8 mask -> a8r8g8b8:
 *   dst = (src * mask) + dst * (255 - (src * mask).alpha)
 * The vmull / vrshr #8 / vraddhn triple implements the standard rounding
 * division by 255: (t + ((t + 128) >> 8) + 128) >> 8, narrowing the
 * 16-bit products back to 8 bits.
 */
1133 .macro pixman_composite_over_n_8_8888_process_pixblock_head
1134 /* expecting deinterleaved source data in {d8, d9, d10, d11} */
1135 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
1136 /* and destination data in {d4, d5, d6, d7} */
1137 /* mask is in d24 (d25, d26, d27 are unused) */
/* in_mask: multiply every solid-source channel by the 8-bit mask */
1140 vmull.u8 q0, d24, d8
1141 vmull.u8 q1, d24, d9
1142 vmull.u8 q6, d24, d10
1143 vmull.u8 q7, d24, d11
/* rounding division by 255 of the 16-bit products (narrow to 8 bits) */
1144 vrshr.u16 q10, q0, #8
1145 vrshr.u16 q11, q1, #8
1146 vrshr.u16 q12, q6, #8
1147 vrshr.u16 q13, q7, #8
1148 vraddhn.u16 d0, q0, q10
1149 vraddhn.u16 d1, q1, q11
1150 vraddhn.u16 d2, q6, q12
1151 vraddhn.u16 d3, q7, q13
1152 vmvn.8 d24, d3 /* get inverted alpha */
1153 /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */
1154 /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
1155 /* now do alpha blending */
/* dst * (255 - alpha); the /255 + saturating add finish in the tail */
1156 vmull.u8 q8, d24, d4
1157 vmull.u8 q9, d24, d5
1158 vmull.u8 q10, d24, d6
1159 vmull.u8 q11, d24, d7

1162 .macro pixman_composite_over_n_8_8888_process_pixblock_tail
/* finish the /255 of the blend products started in the head */
1163 vrshr.u16 q14, q8, #8
1164 vrshr.u16 q15, q9, #8
1165 vrshr.u16 q12, q10, #8
1166 vrshr.u16 q13, q11, #8
1167 vraddhn.u16 d28, q14, q8
1168 vraddhn.u16 d29, q15, q9
1169 vraddhn.u16 d30, q12, q10
1170 vraddhn.u16 d31, q13, q11
/* result = masked src + attenuated dst, with unsigned saturation */
1171 vqadd.u8 q14, q0, q14
1172 vqadd.u8 q15, q1, q15

1175 /* TODO: expand macros and do better instructions scheduling */
1176 .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
1177 pixman_composite_over_n_8_8888_process_pixblock_tail
/* store finished pixels, then fetch dst and mask for the next block */
1178 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1179 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1180 vld1.8 {d24}, [MASK]!
1182 pixman_composite_over_n_8_8888_process_pixblock_head

1185 .macro pixman_composite_over_n_8_8888_init
/* load the solid source color from the stack; upstream splats it into
 * d8-d11 (splat lines missing from this extract -- TODO confirm) */
1186 add DUMMY, sp, #ARGS_STACK_OFFSET
1188 vld1.32 {d11[0]}, [DUMMY]

1195 .macro pixman_composite_over_n_8_8888_cleanup

/* src_bpp=0 (solid), mask_bpp=8, dst_bpp=32; reads and writes dst */
1199 generate_composite_function \
1200 pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
1201 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1202 8, /* number of pixels, processed in a single block */ \
1203 5, /* prefetch distance */ \
1204 pixman_composite_over_n_8_8888_init, \
1205 pixman_composite_over_n_8_8888_cleanup, \
1206 pixman_composite_over_n_8_8888_process_pixblock_head, \
1207 pixman_composite_over_n_8_8888_process_pixblock_tail, \
1208 pixman_composite_over_n_8_8888_process_pixblock_tail_head
1210 /******************************************************************************/
/*
 * OVER operator with component alpha, solid color and an a8r8g8b8 mask:
 * each mask channel scales the matching source channel, and the blend
 * uses per-channel "inverse alpha" (src.alpha * mask channel, inverted).
 * Same vmull / vrshr #8 / vraddhn rounding-/255 pattern as elsewhere.
 */
1212 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1214 * 'combine_mask_ca' replacement
1216 * input: solid src (n) in {d8, d9, d10, d11}
1217 * dest in {d4, d5, d6, d7 }
1218 * mask in {d24, d25, d26, d27}
1219 * output: updated src in {d0, d1, d2, d3 }
1220 * updated mask in {d24, d25, d26, d3 }
/* src channel * mask channel (component-wise) */
1222 vmull.u8 q0, d24, d8
1223 vmull.u8 q1, d25, d9
1224 vmull.u8 q6, d26, d10
1225 vmull.u8 q7, d27, d11
/* src.alpha (d11) * mask channel, for the per-channel alpha terms */
1226 vmull.u8 q9, d11, d25
1227 vmull.u8 q12, d11, d24
1228 vmull.u8 q13, d11, d26
/* rounding /255 and narrow for the masked source channels */
1229 vrshr.u16 q8, q0, #8
1230 vrshr.u16 q10, q1, #8
1231 vrshr.u16 q11, q6, #8
1232 vraddhn.u16 d0, q0, q8
1233 vraddhn.u16 d1, q1, q10
1234 vraddhn.u16 d2, q6, q11
/* rounding /255 and narrow for the per-channel alpha terms */
1235 vrshr.u16 q11, q12, #8
1236 vrshr.u16 q8, q9, #8
1237 vrshr.u16 q6, q13, #8
1238 vrshr.u16 q10, q7, #8
1239 vraddhn.u16 d24, q12, q11
1240 vraddhn.u16 d25, q9, q8
1241 vraddhn.u16 d26, q13, q6
1242 vraddhn.u16 d3, q7, q10
1244 * 'combine_over_ca' replacement
1246 * output: updated dest in {d28, d29, d30, d31}
/* dst channel * inverted per-channel alpha (inversion lines are not
 * visible in this extract -- TODO confirm against upstream) */
1250 vmull.u8 q8, d24, d4
1251 vmull.u8 q9, d25, d5
1254 vmull.u8 q10, d26, d6
1255 vmull.u8 q11, d27, d7

1258 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
1259 /* ... continue 'combine_over_ca' replacement */
/* finish the /255 of the blend products, then saturating add */
1260 vrshr.u16 q14, q8, #8
1261 vrshr.u16 q15, q9, #8
1262 vrshr.u16 q6, q10, #8
1263 vrshr.u16 q7, q11, #8
1264 vraddhn.u16 d28, q14, q8
1265 vraddhn.u16 d29, q15, q9
1266 vraddhn.u16 d30, q6, q10
1267 vraddhn.u16 d31, q7, q11
1268 vqadd.u8 q14, q0, q14
1269 vqadd.u8 q15, q1, q15

/* software-pipelined variant: interleaves the tail computation of the
 * previous block with loads for the next one */
1272 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1273 vrshr.u16 q14, q8, #8
1274 vrshr.u16 q15, q9, #8
1275 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1276 vrshr.u16 q6, q10, #8
1277 vrshr.u16 q7, q11, #8
1278 vraddhn.u16 d28, q14, q8
1279 vraddhn.u16 d29, q15, q9
1280 vraddhn.u16 d30, q6, q10
1281 vraddhn.u16 d31, q7, q11
1282 vld4.8 {d24, d25, d26, d27}, [MASK]!
1283 vqadd.u8 q14, q0, q14
1284 vqadd.u8 q15, q1, q15
1286 pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1287 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!

1290 .macro pixman_composite_over_n_8888_8888_ca_init
/* load the solid source color; upstream splats it into d8-d11
 * (splat lines missing from this extract -- TODO confirm) */
1291 add DUMMY, sp, #ARGS_STACK_OFFSET
1293 vld1.32 {d11[0]}, [DUMMY]

1300 .macro pixman_composite_over_n_8888_8888_ca_cleanup

/* src_bpp=0 (solid), mask_bpp=32, dst_bpp=32; reads and writes dst */
1304 generate_composite_function \
1305 pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
1306 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1307 8, /* number of pixels, processed in a single block */ \
1308 5, /* prefetch distance */ \
1309 pixman_composite_over_n_8888_8888_ca_init, \
1310 pixman_composite_over_n_8888_8888_ca_cleanup, \
1311 pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
1312 pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
1313 pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1315 /******************************************************************************/
/*
 * ADD operator, solid color with an a8 mask -> a8:
 *   dst = saturate(dst + src.alpha * mask)
 * Only the solid source's alpha (d11) participates; each of the four
 * mask registers d24-d27 is scaled by it with the rounding-/255 pattern,
 * then added to the destination with unsigned saturation.
 */
1317 .macro pixman_composite_add_n_8_8_process_pixblock_head
1318 /* expecting source data in {d8, d9, d10, d11} */
1319 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
1320 /* and destination data in {d4, d5, d6, d7} */
1321 /* mask is in d24, d25, d26, d27 */
/* mask * src.alpha */
1322 vmull.u8 q0, d24, d11
1323 vmull.u8 q1, d25, d11
1324 vmull.u8 q6, d26, d11
1325 vmull.u8 q7, d27, d11
/* rounding division by 255, narrowing back to 8 bits */
1326 vrshr.u16 q10, q0, #8
1327 vrshr.u16 q11, q1, #8
1328 vrshr.u16 q12, q6, #8
1329 vrshr.u16 q13, q7, #8
1330 vraddhn.u16 d0, q0, q10
1331 vraddhn.u16 d1, q1, q11
1332 vraddhn.u16 d2, q6, q12
1333 vraddhn.u16 d3, q7, q13
/* saturating add onto the destination (q2/q3 = d4-d7) */
1334 vqadd.u8 q14, q0, q2
1335 vqadd.u8 q15, q1, q3

1338 .macro pixman_composite_add_n_8_8_process_pixblock_tail

1341 /* TODO: expand macros and do better instructions scheduling */
1342 .macro pixman_composite_add_n_8_8_process_pixblock_tail_head
1343 pixman_composite_add_n_8_8_process_pixblock_tail
/* store results, then load dst and mask for the next 32 pixels */
1344 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1345 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1346 vld1.8 {d24, d25, d26, d27}, [MASK]!
1347 cache_preload 32, 32
1348 pixman_composite_add_n_8_8_process_pixblock_head

1351 .macro pixman_composite_add_n_8_8_init
/* load the solid source color; its alpha byte ends up in d11 (upstream
 * splat lines missing from this extract -- TODO confirm) */
1352 add DUMMY, sp, #ARGS_STACK_OFFSET
1354 vld1.32 {d11[0]}, [DUMMY]

1358 .macro pixman_composite_add_n_8_8_cleanup

/* src_bpp=0 (solid), mask_bpp=8, dst_bpp=8; reads and writes dst */
1362 generate_composite_function \
1363 pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
1364 FLAG_DST_READWRITE, \
1365 32, /* number of pixels, processed in a single block */ \
1366 5, /* prefetch distance */ \
1367 pixman_composite_add_n_8_8_init, \
1368 pixman_composite_add_n_8_8_cleanup, \
1369 pixman_composite_add_n_8_8_process_pixblock_head, \
1370 pixman_composite_add_n_8_8_process_pixblock_tail, \
1371 pixman_composite_add_n_8_8_process_pixblock_tail_head
1373 /******************************************************************************/
/*
 * ADD operator, a8 source with an a8 mask -> a8:
 *   dst = saturate(dst + src * mask)
 * Per-byte multiply of source by mask (rounding /255), then a saturating
 * add onto the destination.  32 pixels per block.
 */
1375 .macro pixman_composite_add_8_8_8_process_pixblock_head
1376 /* expecting source data in {d0, d1, d2, d3} */
1377 /* destination data in {d4, d5, d6, d7} */
1378 /* mask in {d24, d25, d26, d27} */
/* src * mask */
1379 vmull.u8 q8, d24, d0
1380 vmull.u8 q9, d25, d1
1381 vmull.u8 q10, d26, d2
1382 vmull.u8 q11, d27, d3
/* rounding division by 255, narrowing back to 8 bits */
1383 vrshr.u16 q0, q8, #8
1384 vrshr.u16 q1, q9, #8
1385 vrshr.u16 q12, q10, #8
1386 vrshr.u16 q13, q11, #8
1387 vraddhn.u16 d0, q0, q8
1388 vraddhn.u16 d1, q1, q9
1389 vraddhn.u16 d2, q12, q10
1390 vraddhn.u16 d3, q13, q11
/* saturating add onto the destination (q2/q3 = d4-d7) */
1391 vqadd.u8 q14, q0, q2
1392 vqadd.u8 q15, q1, q3

1395 .macro pixman_composite_add_8_8_8_process_pixblock_tail

1398 /* TODO: expand macros and do better instructions scheduling */
1399 .macro pixman_composite_add_8_8_8_process_pixblock_tail_head
1400 pixman_composite_add_8_8_8_process_pixblock_tail
/* store results, then load dst, mask and src for the next block */
1401 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1402 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1403 vld1.8 {d24, d25, d26, d27}, [MASK]!
1404 vld1.8 {d0, d1, d2, d3}, [SRC]!
1405 cache_preload 32, 32
1406 pixman_composite_add_8_8_8_process_pixblock_head

1409 .macro pixman_composite_add_8_8_8_init

1412 .macro pixman_composite_add_8_8_8_cleanup

/* src_bpp=8, mask_bpp=8, dst_bpp=8; reads and writes dst */
1415 generate_composite_function \
1416 pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
1417 FLAG_DST_READWRITE, \
1418 32, /* number of pixels, processed in a single block */ \
1419 5, /* prefetch distance */ \
1420 pixman_composite_add_8_8_8_init, \
1421 pixman_composite_add_8_8_8_cleanup, \
1422 pixman_composite_add_8_8_8_process_pixblock_head, \
1423 pixman_composite_add_8_8_8_process_pixblock_tail, \
1424 pixman_composite_add_8_8_8_process_pixblock_tail_head
1426 /******************************************************************************/
/*
 * ADD operator, a8r8g8b8 source with an a8r8g8b8 mask -> a8r8g8b8:
 *   dst = saturate(dst + src * mask.alpha)
 * Note every source channel is multiplied by d27 (the deinterleaved mask
 * alpha plane) -- unified (non-component) alpha.  Also instantiated as a
 * single-scanline variant used by the generic combiner paths.
 */
1428 .macro pixman_composite_add_8888_8888_8888_process_pixblock_head
1429 /* expecting source data in {d0, d1, d2, d3} */
1430 /* destination data in {d4, d5, d6, d7} */
1431 /* mask in {d24, d25, d26, d27} */
/* src channel * mask.alpha (d27) */
1432 vmull.u8 q8, d27, d0
1433 vmull.u8 q9, d27, d1
1434 vmull.u8 q10, d27, d2
1435 vmull.u8 q11, d27, d3
/* rounding division by 255, narrowing back to 8 bits */
1436 vrshr.u16 q0, q8, #8
1437 vrshr.u16 q1, q9, #8
1438 vrshr.u16 q12, q10, #8
1439 vrshr.u16 q13, q11, #8
1440 vraddhn.u16 d0, q0, q8
1441 vraddhn.u16 d1, q1, q9
1442 vraddhn.u16 d2, q12, q10
1443 vraddhn.u16 d3, q13, q11
/* saturating add onto the destination (q2/q3 = d4-d7) */
1444 vqadd.u8 q14, q0, q2
1445 vqadd.u8 q15, q1, q3

1448 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail

1451 /* TODO: expand macros and do better instructions scheduling */
1452 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
1453 pixman_composite_add_8888_8888_8888_process_pixblock_tail
/* store results, then load deinterleaved dst, mask and src */
1454 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1455 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1456 vld4.8 {d24, d25, d26, d27}, [MASK]!
1457 vld4.8 {d0, d1, d2, d3}, [SRC]!
1459 pixman_composite_add_8888_8888_8888_process_pixblock_head

/* full fast-path variant (with prefetch) */
1462 generate_composite_function \
1463 pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
1464 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1465 8, /* number of pixels, processed in a single block */ \
1466 10, /* prefetch distance */ \
1469 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1470 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1471 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head

/* single-scanline variant reusing the same pixel-block macros */
1473 generate_composite_function_single_scanline \
1474 pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
1475 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1476 8, /* number of pixels, processed in a single block */ \
1479 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1480 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1481 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
1483 /******************************************************************************/
/*
 * OUT_REVERSE operator, a8r8g8b8 source with a solid mask:
 * the head scales the source by the mask alpha (d15) with the rounding
 * /255 pattern, inverts the resulting alpha, and starts multiplying the
 * destination by that inverted alpha; the tail finishes the /255.
 * The head/tail pair is also reused by the OVER variants below.
 */
1485 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
1486 /* expecting source data in {d0, d1, d2, d3} */
1487 /* destination data in {d4, d5, d6, d7} */
1488 /* solid mask is in d15 */
/* src * mask.alpha */
1491 vmull.u8 q8, d15, d3
1492 vmull.u8 q6, d15, d2
1493 vmull.u8 q5, d15, d1
1494 vmull.u8 q4, d15, d0
/* rounding division by 255, narrowing back into d0-d3 */
1495 vrshr.u16 q13, q8, #8
1496 vrshr.u16 q12, q6, #8
1497 vrshr.u16 q11, q5, #8
1498 vrshr.u16 q10, q4, #8
1499 vraddhn.u16 d3, q8, q13
1500 vraddhn.u16 d2, q6, q12
1501 vraddhn.u16 d1, q5, q11
1502 vraddhn.u16 d0, q4, q10
1503 vmvn.8 d24, d3 /* get inverted alpha */
1504 /* now do alpha blending */
/* dst * (255 - masked src alpha); /255 completes in the tail */
1505 vmull.u8 q8, d24, d4
1506 vmull.u8 q9, d24, d5
1507 vmull.u8 q10, d24, d6
1508 vmull.u8 q11, d24, d7

1511 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
/* finish the /255 of the blend products into d28-d31 */
1512 vrshr.u16 q14, q8, #8
1513 vrshr.u16 q15, q9, #8
1514 vrshr.u16 q12, q10, #8
1515 vrshr.u16 q13, q11, #8
1516 vraddhn.u16 d28, q14, q8
1517 vraddhn.u16 d29, q15, q9
1518 vraddhn.u16 d30, q12, q10
1519 vraddhn.u16 d31, q13, q11

1522 .macro pixman_composite_out_reverse_8888_8888_8888_init

1526 .macro pixman_composite_out_reverse_8888_8888_8888_cleanup

1530 /* TODO: expand macros and do better instructions scheduling */
1531 .macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
/* per-pixel mask variant: mask alpha is loaded into d15 each block */
1532 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1533 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
1534 vld4.8 {d0, d1, d2, d3}, [SRC]!
1536 vld4.8 {d12, d13, d14, d15}, [MASK]!
1537 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
1538 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!

/* scanline combiner entry point (no prefetch machinery) */
1541 generate_composite_function_single_scanline \
1542 pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
1543 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1544 8, /* number of pixels, processed in a single block */ \
1545 pixman_composite_out_reverse_8888_8888_8888_init, \
1546 pixman_composite_out_reverse_8888_8888_8888_cleanup, \
1547 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
1548 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
1549 pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \
1550 28, /* dst_w_basereg */ \
1551 4, /* dst_r_basereg */ \
1552 0, /* src_basereg */ \
1553 12 /* mask_basereg */
1555 /******************************************************************************/
/*
 * OVER operator, a8r8g8b8 source with a solid mask -> a8r8g8b8:
 * reuses the out_reverse head/tail and adds the masked source back in
 * with a saturating add (OVER = masked src + dst * (255 - alpha)).
 */
1557 .macro pixman_composite_over_8888_n_8888_process_pixblock_head
1558 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head

1561 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail
1562 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
/* add the masked source (q0/q1) to the attenuated destination */
1563 vqadd.u8 q14, q0, q14
1564 vqadd.u8 q15, q1, q15

1567 /* TODO: expand macros and do better instructions scheduling */
1568 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
1569 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1570 pixman_composite_over_8888_n_8888_process_pixblock_tail
1571 vld4.8 {d0, d1, d2, d3}, [SRC]!
1573 pixman_composite_over_8888_n_8888_process_pixblock_head
1574 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!

1577 .macro pixman_composite_over_8888_n_8888_init
/* load the solid mask; its alpha ends up replicated in d15 (the DUMMY
 * setup/splat lines are missing from this extract -- TODO confirm) */
1580 vld1.32 {d15[0]}, [DUMMY]

1584 .macro pixman_composite_over_8888_n_8888_cleanup

/* src_bpp=32, mask_bpp=0 (solid), dst_bpp=32; reads and writes dst */
1588 generate_composite_function \
1589 pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
1590 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1591 8, /* number of pixels, processed in a single block */ \
1592 5, /* prefetch distance */ \
1593 pixman_composite_over_8888_n_8888_init, \
1594 pixman_composite_over_8888_n_8888_cleanup, \
1595 pixman_composite_over_8888_n_8888_process_pixblock_head, \
1596 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1597 pixman_composite_over_8888_n_8888_process_pixblock_tail_head
1599 /******************************************************************************/
/*
 * OVER operator, a8r8g8b8 source with an a8r8g8b8 mask: same math as
 * over_8888_n_8888 (head/tail are reused), but the mask alpha plane is
 * reloaded into d15 for every pixel block via vld4.  Instantiated both
 * as a full fast path and as a single-scanline combiner.
 */
1601 /* TODO: expand macros and do better instructions scheduling */
1602 .macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
1603 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1604 pixman_composite_over_8888_n_8888_process_pixblock_tail
1605 vld4.8 {d0, d1, d2, d3}, [SRC]!
/* per-pixel mask: alpha plane lands in d15, as the head expects */
1607 vld4.8 {d12, d13, d14, d15}, [MASK]!
1608 pixman_composite_over_8888_n_8888_process_pixblock_head
1609 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!

1612 .macro pixman_composite_over_8888_8888_8888_init

1616 .macro pixman_composite_over_8888_8888_8888_cleanup

/* full fast-path variant */
1620 generate_composite_function \
1621 pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
1622 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1623 8, /* number of pixels, processed in a single block */ \
1624 5, /* prefetch distance */ \
1625 pixman_composite_over_8888_8888_8888_init, \
1626 pixman_composite_over_8888_8888_8888_cleanup, \
1627 pixman_composite_over_8888_n_8888_process_pixblock_head, \
1628 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1629 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
1630 28, /* dst_w_basereg */ \
1631 4, /* dst_r_basereg */ \
1632 0, /* src_basereg */ \
1633 12 /* mask_basereg */

/* single-scanline combiner variant */
1635 generate_composite_function_single_scanline \
1636 pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
1637 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1638 8, /* number of pixels, processed in a single block */ \
1639 pixman_composite_over_8888_8888_8888_init, \
1640 pixman_composite_over_8888_8888_8888_cleanup, \
1641 pixman_composite_over_8888_n_8888_process_pixblock_head, \
1642 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1643 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
1644 28, /* dst_w_basereg */ \
1645 4, /* dst_r_basereg */ \
1646 0, /* src_basereg */ \
1647 12 /* mask_basereg */
1649 /******************************************************************************/
/*
 * OVER operator, a8r8g8b8 source with an a8 mask: again reuses the
 * over_8888_n_8888 head/tail; the 8-bit mask is loaded straight into
 * d15 (mask_basereg = 15), where the head expects the mask alpha.
 */
1651 /* TODO: expand macros and do better instructions scheduling */
1652 .macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
1653 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1654 pixman_composite_over_8888_n_8888_process_pixblock_tail
1655 vld4.8 {d0, d1, d2, d3}, [SRC]!
/* 8 mask bytes per block go directly into d15 */
1657 vld1.8 {d15}, [MASK]!
1658 pixman_composite_over_8888_n_8888_process_pixblock_head
1659 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!

1662 .macro pixman_composite_over_8888_8_8888_init

1666 .macro pixman_composite_over_8888_8_8888_cleanup

/* src_bpp=32, mask_bpp=8, dst_bpp=32; reads and writes dst */
1670 generate_composite_function \
1671 pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
1672 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1673 8, /* number of pixels, processed in a single block */ \
1674 5, /* prefetch distance */ \
1675 pixman_composite_over_8888_8_8888_init, \
1676 pixman_composite_over_8888_8_8888_cleanup, \
1677 pixman_composite_over_8888_n_8888_process_pixblock_head, \
1678 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1679 pixman_composite_over_8888_8_8888_process_pixblock_tail_head \
1680 28, /* dst_w_basereg */ \
1681 4, /* dst_r_basereg */ \
1682 0, /* src_basereg */ \
1683 15 /* mask_basereg */
1685 /******************************************************************************/
/*
 * SRC operator, 24bpp -> 24bpp: plain copy of packed b8g8r8 pixels using
 * 3-element structure loads/stores (vld3/vst3), 8 pixels per iteration.
 * Note: no :128 alignment hint on the stores -- 24bpp rows are not
 * guaranteed 16-byte aligned.
 */
1687 .macro pixman_composite_src_0888_0888_process_pixblock_head
1690 .macro pixman_composite_src_0888_0888_process_pixblock_tail
1693 .macro pixman_composite_src_0888_0888_process_pixblock_tail_head
/* store previous block, load next block (deinterleaved channels) */
1694 vst3.8 {d0, d1, d2}, [DST_W]!
1695 vld3.8 {d0, d1, d2}, [SRC]!

/* src_bpp=24, mask_bpp=0, dst_bpp=24; destination is write-only */
1699 generate_composite_function \
1700 pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
1701 FLAG_DST_WRITEONLY, \
1702 8, /* number of pixels, processed in a single block */ \
1703 10, /* prefetch distance */ \
1706 pixman_composite_src_0888_0888_process_pixblock_head, \
1707 pixman_composite_src_0888_0888_process_pixblock_tail, \
1708 pixman_composite_src_0888_0888_process_pixblock_tail_head, \
1709 0, /* dst_w_basereg */ \
1710 0, /* dst_r_basereg */ \
1711 0, /* src_basereg */ \
1712 0 /* mask_basereg */
1714 /******************************************************************************/
/*
 * SRC operator, 24bpp -> 32bpp with reversed component order ("_rev"):
 * loads 3-channel pixels with vld3 and stores 4-channel pixels with
 * vst4, so d3 supplies the alpha plane.  The channel swap and the init
 * that fills d3 with the constant alpha are not visible in this
 * extract -- TODO confirm against upstream.
 */
1716 .macro pixman_composite_src_0888_8888_rev_process_pixblock_head
1720 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
1723 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
/* store previous block as 4-plane ARGB, load next 3-plane block */
1724 vst4.8 {d0, d1, d2, d3}, [DST_W]!
1725 vld3.8 {d0, d1, d2}, [SRC]!

1730 .macro pixman_composite_src_0888_8888_rev_init

/* src_bpp=24, mask_bpp=0, dst_bpp=32; destination is write-only */
1734 generate_composite_function \
1735 pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
1736 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
1737 8, /* number of pixels, processed in a single block */ \
1738 10, /* prefetch distance */ \
1739 pixman_composite_src_0888_8888_rev_init, \
1741 pixman_composite_src_0888_8888_rev_process_pixblock_head, \
1742 pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
1743 pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
1744 0, /* dst_w_basereg */ \
1745 0, /* dst_r_basereg */ \
1746 0, /* src_basereg */ \
1747 0 /* mask_basereg */
1749 /******************************************************************************/
/*
 * SRC operator, 24bpp (reversed order) -> r5g6b5:
 * the tail packs one block into 565 by widening one channel into the
 * top bits (vshll #8) and shift-inserting the other two with vsri
 * (#5 for the 6-bit field, #11 for the low 5-bit field).  q8/q9 are
 * prepared in the head, which is not visible in this extract.
 */
1751 .macro pixman_composite_src_0888_0565_rev_process_pixblock_head
1756 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
/* pack: top channel << 8, then insert the 6-bit and 5-bit fields */
1757 vshll.u8 q14, d0, #8
1758 vsri.u16 q14, q8, #5
1759 vsri.u16 q14, q9, #11

1762 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
/* same packing as the tail, interleaved with the next block's load */
1763 vshll.u8 q14, d0, #8
1764 vld3.8 {d0, d1, d2}, [SRC]!
1765 vsri.u16 q14, q8, #5
1766 vsri.u16 q14, q9, #11
1768 vst1.16 {d28, d29}, [DST_W, :128]!

/* src_bpp=24, mask_bpp=0, dst_bpp=16; result block lives in d28/d29 */
1772 generate_composite_function \
1773 pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
1774 FLAG_DST_WRITEONLY, \
1775 8, /* number of pixels, processed in a single block */ \
1776 10, /* prefetch distance */ \
1779 pixman_composite_src_0888_0565_rev_process_pixblock_head, \
1780 pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
1781 pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
1782 28, /* dst_w_basereg */ \
1783 0, /* dst_r_basereg */ \
1784 0, /* src_basereg */ \
1785 0 /* mask_basereg */
1787 /******************************************************************************/
/*
 * SRC operator, "pixbuf" (non-premultiplied) -> a8r8g8b8:
 * multiplies color channels by the alpha channel d3 (premultiplication)
 * using the rounding-/255 pattern; results are assembled in d28-d31.
 * The PF-prefixed lines are the generator's conditional software
 * prefetch machinery (PF_X pixel counter, PF_CTL control word, pld on
 * PF_SRC, stride advance via the write-back ldrgeb).
 * NOTE(review): only one of the head's three vmull lines is visible in
 * this extract; q8/q9 are produced by the missing lines -- verify
 * against upstream before assembling.
 */
1789 .macro pixman_composite_src_pixbuf_8888_process_pixblock_head
/* channel * alpha (q8/q9 for the other channels are set up by lines
 * missing from this extract) */
1792 vmull.u8 q10, d3, d2

1795 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
/* rounding division by 255, narrowing into the d28-d30 result planes */
1796 vrshr.u16 q11, q8, #8
1798 vrshr.u16 q12, q9, #8
1799 vrshr.u16 q13, q10, #8
1800 vraddhn.u16 d30, q11, q8
1801 vraddhn.u16 d29, q12, q9
1802 vraddhn.u16 d28, q13, q10

1805 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
/* tail of the previous block interleaved with the next block's load
 * and the prefetcher bookkeeping */
1806 vrshr.u16 q11, q8, #8
1808 vrshr.u16 q12, q9, #8
1809 vrshr.u16 q13, q10, #8
1810 vld4.8 {d0, d1, d2, d3}, [SRC]!
1811 vraddhn.u16 d30, q11, q8
1812 PF add PF_X, PF_X, #8
1814 PF addne PF_X, PF_X, #8
1815 PF subne PF_CTL, PF_CTL, #1
1816 vraddhn.u16 d29, q12, q9
1817 vraddhn.u16 d28, q13, q10
1820 vmull.u8 q10, d3, d2
1821 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1823 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
1824 PF subge PF_X, PF_X, ORIG_W
1825 PF subges PF_CTL, PF_CTL, #0x10
1826 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!

/* src_bpp=32, mask_bpp=0, dst_bpp=32; result block lives in d28-d31 */
1829 generate_composite_function \
1830 pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
1831 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
1832 8, /* number of pixels, processed in a single block */ \
1833 10, /* prefetch distance */ \
1836 pixman_composite_src_pixbuf_8888_process_pixblock_head, \
1837 pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
1838 pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
1839 28, /* dst_w_basereg */ \
1840 0, /* dst_r_basereg */ \
1841 0, /* src_basereg */ \
1842 0 /* mask_basereg */