2 * Copyright © 2009 Nokia Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
23 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
27 * This file contains implementations of NEON optimized pixel processing
28 * functions. There is no full and detailed tutorial, but some functions
29 * (those which are exposing some new or interesting features) are
30 * extensively commented and can be used as examples.
32 * You may want to have a look at the comments for following functions:
33 * - pixman_composite_over_8888_0565_asm_neon
34 * - pixman_composite_over_n_8_0565_asm_neon
37 /* Prevent the stack from becoming executable for no reason... */
38 #if defined(__linux__) && defined(__ELF__)
39 .section .note.GNU-stack,"",%progbits
46 .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
47 .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
51 #include "pixman-arm-neon-asm.h"
53 /* Global configuration options and preferences */
56 * The code can optionally make use of unaligned memory accesses to improve
57 * performance of handling leading/trailing pixels for each scanline.
58 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
59 * example in linux if unaligned memory accesses are not configured to
60 * generate exceptions.
62 .set RESPECT_STRICT_ALIGNMENT, 1
65 * Set default prefetch type. There is a choice between the following options:
67 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
68 * as NOP to workaround some HW bugs or for whatever other reason)
70 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
71 * advanced prefetch introduces heavy overhead)
73 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
74 * which can run ARM and NEON instructions simultaneously so that extra ARM
75 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
77 * Note: some types of function can't support advanced prefetch and fall back
78 * to simple one (those which handle 24bpp pixels)
80 .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
82 /* Prefetch distance in pixels for simple prefetch */
83 .set PREFETCH_DISTANCE_SIMPLE, 64
86 * Implementation of pixman_composite_over_8888_0565_asm_neon
88 * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
89 * performs OVER compositing operation. Function fast_composite_over_8888_0565
90 * from pixman-fast-path.c does the same in C and can be used as a reference.
92 * First we need to have some NEON assembly code which can do the actual
93 * operation on the pixels and provide it to the template macro.
95 * Template macro quite conveniently takes care of emitting all the necessary
96 * code for memory reading and writing (including quite tricky cases of
97 * handling unaligned leading/trailing pixels), so we only need to deal with
98 * the data in NEON registers.
100 * NEON registers allocation in general is recommended to be the following:
101 * d0, d1, d2, d3 - contain loaded source pixel data
102 * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed)
103 * d24, d25, d26, d27 - contain loaded mask pixel data (if mask is used)
104 * d28, d29, d30, d31 - place for storing the result (destination pixels)
106 * As can be seen above, four 64-bit NEON registers are used for keeping
107 * intermediate pixel data and up to 8 pixels can be processed in one step
108 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
110 * This particular function uses the following registers allocation:
111 * d0, d1, d2, d3 - contain loaded source pixel data
112 * d4, d5 - contain loaded destination pixels (they are needed)
113 * d28, d29 - place for storing the result (destination pixels)
117 * Step one. We need to have some code to do some arithmetics on pixel data.
118 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
119 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
120 * perform all the needed calculations and write the result to {d28, d29}.
121 * The rationale for having two macros and not just one will be explained
122 * later. In practice, any single monolithic function which does the work can
123 * be split into two parts in any arbitrary way without affecting correctness.
125 * There is one special trick here too. Common template macro can optionally
126 * make our life a bit easier by doing R, G, B, A color components
127 * deinterleaving for 32bpp pixel formats (and this feature is used in
128 * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
129 * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
130 * actually use d0 register for blue channel (a vector of eight 8-bit
131 * values), d1 register for green, d2 for red and d3 for alpha. This
132 * simple conversion can be also done with a few NEON instructions:
134 * Packed to planar conversion:
140 * Planar to packed conversion:
146 * But pixel can be loaded directly in planar format using VLD4.8 NEON
147 * instruction. It is 1 cycle slower than VLD1.32, so this is not always
148 * desirable, that's why deinterleaving is optional.
150 * But anyway, here is the code:
152 .macro pixman_composite_over_8888_0565_process_pixblock_head
153 /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
154 and put data into d6 - red, d7 - green, d30 - blue */
/* NOTE(review): several instructions of this macro are elided in this
 * excerpt (the r5g6b5 unpacking of red/green and the vmull.u8 multiplies
 * that produce q10/q11 consumed below) -- confirm against the full file. */
159 vmvn.8 d3, d3 /* invert source alpha */
161 vshrn.u16 d30, q2, #2
162 /* now do alpha blending, storing results in 8-bit planar format
163 into d16 - red, d19 - green, d18 - blue */
166 vmull.u8 q12, d3, d30 /* multiply inverted src alpha with dst channel */
/* vrshr + vraddhn pairs below implement the rounding division by 255
 * of the 16-bit products, narrowing back to 8-bit planar channels */
167 vrshr.u16 q13, q10, #8
168 vrshr.u16 q3, q11, #8
169 vrshr.u16 q15, q12, #8
170 vraddhn.u16 d20, q10, q13
171 vraddhn.u16 d23, q11, q3
172 vraddhn.u16 d22, q12, q15
/* tail part: finishes the blend started in '*_head' (see the rationale
 * for the head/tail split in the comments above this macro pair) */
175 .macro pixman_composite_over_8888_0565_process_pixblock_tail
176 /* ... continue alpha blending */
177 vqadd.u8 d16, d2, d20 /* saturating add of src channel and blended dst */
179 /* convert the result to r5g6b5 and store it into {d28, d29} */
180 vshll.u8 q14, d16, #8
/* NOTE(review): the remaining vqadd/vshll repack steps (original lines
 * 178, 181-183) are elided from this excerpt */
184 vsri.u16 q14, q9, #11
188 * OK, now we got almost everything that we need. Using the above two
189 * macros, the work can be done right. But now we want to optimize
190 * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
191 * a lot from good code scheduling and software pipelining.
193 * Let's construct some code, which will run in the core main loop.
194 * Some pseudo-code of the main loop will look like this:
202 * It may look a bit weird, but this setup allows to hide instruction
203 * latencies better and also utilize dual-issue capability more
204 * efficiently (make pairs of load-store and ALU instructions).
206 * So what we need now is a '*_tail_head' macro, which will be used
207 * in the core main loop. A trivial straightforward implementation
208 * of this macro would look like this:
210 * pixman_composite_over_8888_0565_process_pixblock_tail
211 * vst1.16 {d28, d29}, [DST_W, :128]!
212 * vld1.16 {d4, d5}, [DST_R, :128]!
213 * vld4.32 {d0, d1, d2, d3}, [SRC]!
214 * pixman_composite_over_8888_0565_process_pixblock_head
217 * Now it also got some VLD/VST instructions. We simply can't move from
218 * processing one block of pixels to the other one with just arithmetics.
219 * The previously processed data needs to be written to memory and new
220 * data needs to be fetched. Fortunately, this main loop does not deal
221 * with partial leading/trailing pixels and can load/store a full block
222 * of pixels in a bulk. Additionally, destination buffer is already
223 * 16 bytes aligned here (which is good for performance).
225 * New things here are DST_R, DST_W, SRC and MASK identifiers. These
226 * are the aliases for ARM registers which are used as pointers for
227 * accessing data. We maintain separate pointers for reading and writing
228 * destination buffer (DST_R and DST_W).
230 * Another new thing is 'cache_preload' macro. It is used for prefetching
231 * data into CPU L2 cache and improve performance when dealing with large
232 * images which are far larger than cache size. It uses one argument
233 * (actually two, but they need to be the same here) - number of pixels
234 * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
235 * details about this macro. Moreover, if good performance is needed
236 * the code from this macro needs to be copied into '*_tail_head' macro
237 * and mixed with the rest of code for optimal instructions scheduling.
238 * We are actually doing it below.
240 * Now after all the explanations, here is the optimized code.
241 * Different instruction streams (originating from '*_head', '*_tail'
242 * and 'cache_preload' macro) use different indentation levels for
243 * better readability. Actually taking the code from one of these
244 * indentation levels and ignoring a few VLD/VST instructions would
245 * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
/* Optimized tail_head: '*_tail' of the previous pixel block is interleaved
 * with '*_head' of the next one plus the 'cache_preload' prefetch stream
 * (PF-prefixed instructions). Different logical streams keep different
 * indentation in the original; instruction ORDER is the scheduling and
 * must not be changed casually. */
251 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head
252 vqadd.u8 d16, d2, d20
253 vld1.16 {d4, d5}, [DST_R, :128]!
256 vld4.8 {d0, d1, d2, d3}, [SRC]! /* load next 8 src pixels, deinterleaved */
259 vshll.u8 q14, d16, #8
260 PF add PF_X, PF_X, #8 /* advance prefetch position by one block */
264 PF addne PF_X, PF_X, #8
266 PF subne PF_CTL, PF_CTL, #1
268 vshrn.u16 d30, q2, #2
270 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
272 vmull.u8 q12, d3, d30
273 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
277 vrshr.u16 q13, q10, #8
278 PF subge PF_X, PF_X, ORIG_W /* wrap prefetch position to next scanline */
279 vrshr.u16 q3, q11, #8
280 vrshr.u16 q15, q12, #8
281 PF subges PF_CTL, PF_CTL, #0x10
282 vsri.u16 q14, q9, #11
283 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
284 vraddhn.u16 d20, q10, q13
285 vraddhn.u16 d23, q11, q3
286 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
287 vraddhn.u16 d22, q12, q15
288 vst1.16 {d28, d29}, [DST_W, :128]! /* store result of previous block */
293 /* If we did not care much about the performance, we would just use this... */
/* NOTE(review): this is a reference (unscheduled) variant with the SAME name
 * as the optimized macro above; a duplicate .macro definition is an
 * assembler error, so this is presumably disabled by a preprocessor guard
 * in the lines elided from this excerpt -- TODO confirm. */
294 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head
295 pixman_composite_over_8888_0565_process_pixblock_tail
296 vst1.16 {d28, d29}, [DST_W, :128]!
297 vld1.16 {d4, d5}, [DST_R, :128]!
298 vld4.32 {d0, d1, d2, d3}, [SRC]!
299 pixman_composite_over_8888_0565_process_pixblock_head
306 * And now the final part. We are using 'generate_composite_function' macro
307 * to put all the stuff together. We are specifying the name of the function
308 * which we want to get, number of bits per pixel for the source, mask and
309 * destination (0 if unused, like mask in this case). Next come some bit
311 * FLAG_DST_READWRITE - tells that the destination buffer is both read
312 * and written, for write-only buffer we would use
313 * FLAG_DST_WRITEONLY flag instead
314 * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
315 * and separate color channels for 32bpp format.
316 * The next things are:
317 * - the number of pixels processed per iteration (8 in this case, because
318 * that's the maximum what can fit into four 64-bit NEON registers).
319 * - prefetch distance, measured in pixel blocks. In this case it is 5 times
320 * by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
321 * prefetch distance can be selected by running some benchmarks.
323 * After that we specify some macros, these are 'default_init',
324 * 'default_cleanup' here which are empty (but it is possible to have custom
325 * init/cleanup macros to be able to save/restore some extra NEON registers
326 * like d8-d15 or do anything else) followed by
327 * 'pixman_composite_over_8888_0565_process_pixblock_head',
328 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
329 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
330 * which we got implemented above.
332 * The last part is the NEON registers allocation scheme.
/* Instantiate the function: a8r8g8b8 OVER r5g6b5, 8 pixels per block,
 * prefetch distance 5 blocks. The default_init/default_cleanup arguments
 * (original lines 339-340) are elided from this excerpt. */
334 generate_composite_function \
335 pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
336 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
337 8, /* number of pixels, processed in a single block */ \
338 5, /* prefetch distance */ \
341 pixman_composite_over_8888_0565_process_pixblock_head, \
342 pixman_composite_over_8888_0565_process_pixblock_tail, \
343 pixman_composite_over_8888_0565_process_pixblock_tail_head, \
344 28, /* dst_w_basereg */ \
345 4, /* dst_r_basereg */ \
346 0, /* src_basereg */ \
347 24 /* mask_basereg */
349 /******************************************************************************/
/* Solid-color (n) OVER r5g6b5: same blend arithmetic as over_8888_0565,
 * but the source is a constant prepared once by the 'init' macro below,
 * so the per-block alpha inversion is done in 'init' instead of 'head'. */
351 .macro pixman_composite_over_n_0565_process_pixblock_head
352 /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
353 and put data into d6 - red, d7 - green, d30 - blue */
/* NOTE(review): unpacking instructions and the vmull.u8 producers of
 * q10/q11 are elided from this excerpt */
359 vshrn.u16 d30, q2, #2
360 /* now do alpha blending, storing results in 8-bit planar format
361 into d16 - red, d19 - green, d18 - blue */
364 vmull.u8 q12, d3, d30
/* rounding division by 255 of the 16-bit products (vrshr + vraddhn) */
365 vrshr.u16 q13, q10, #8
366 vrshr.u16 q3, q11, #8
367 vrshr.u16 q15, q12, #8
368 vraddhn.u16 d20, q10, q13
369 vraddhn.u16 d23, q11, q3
370 vraddhn.u16 d22, q12, q15
373 .macro pixman_composite_over_n_0565_process_pixblock_tail
374 /* ... continue alpha blending */
375 vqadd.u8 d16, d2, d20
377 /* convert the result to r5g6b5 and store it into {d28, d29} */
378 vshll.u8 q14, d16, #8
382 vsri.u16 q14, q9, #11
385 /* TODO: expand macros and do better instructions scheduling */
386 .macro pixman_composite_over_n_0565_process_pixblock_tail_head
387 pixman_composite_over_n_0565_process_pixblock_tail
388 vld1.16 {d4, d5}, [DST_R, :128]!
389 vst1.16 {d28, d29}, [DST_W, :128]!
390 pixman_composite_over_n_0565_process_pixblock_head
/* init: fetch the solid a8r8g8b8 source color from the stack and
 * pre-invert its alpha once (loop body only needs the inverted value) */
394 .macro pixman_composite_over_n_0565_init
395 add DUMMY, sp, #ARGS_STACK_OFFSET
396 vld1.32 {d3[0]}, [DUMMY]
401 vmvn.8 d3, d3 /* invert source alpha */
/* Instantiate: solid color OVER r5g6b5 (src bpp 0 = solid, no mask).
 * The cleanup-macro argument (original line 410) is elided here. */
404 generate_composite_function \
405 pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
406 FLAG_DST_READWRITE, \
407 8, /* number of pixels, processed in a single block */ \
408 5, /* prefetch distance */ \
409 pixman_composite_over_n_0565_init, \
411 pixman_composite_over_n_0565_process_pixblock_head, \
412 pixman_composite_over_n_0565_process_pixblock_tail, \
413 pixman_composite_over_n_0565_process_pixblock_tail_head, \
414 28, /* dst_w_basereg */ \
415 4, /* dst_r_basereg */ \
416 0, /* src_basereg */ \
417 24 /* mask_basereg */
419 /******************************************************************************/
/* SRC operation: a8r8g8b8 -> r5g6b5 format conversion (no blending).
 * NOTE(review): the bodies of 'head' and part of 'tail'/'tail_head'
 * (the vshll packing of r/g/b into q8/q9/q14) are elided here. */
421 .macro pixman_composite_src_8888_0565_process_pixblock_head
427 .macro pixman_composite_src_8888_0565_process_pixblock_tail
429 vsri.u16 q14, q9, #11 /* insert green/red field, completing r5g6b5 */
432 .macro pixman_composite_src_8888_0565_process_pixblock_tail_head
434 PF add PF_X, PF_X, #8
436 vld4.8 {d0, d1, d2, d3}, [SRC]! /* load next 8 src pixels, deinterleaved */
437 PF addne PF_X, PF_X, #8
438 PF subne PF_CTL, PF_CTL, #1
439 vsri.u16 q14, q9, #11
441 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
443 vst1.16 {d28, d29}, [DST_W, :128]! /* store converted pixels */
444 PF subge PF_X, PF_X, ORIG_W
445 PF subges PF_CTL, PF_CTL, #0x10
447 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
/* Instantiate: a8r8g8b8 -> r5g6b5 copy; write-only destination, so no
 * DST_R stream. default_init/default_cleanup args are elided here. */
451 generate_composite_function \
452 pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
453 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
454 8, /* number of pixels, processed in a single block */ \
455 10, /* prefetch distance */ \
458 pixman_composite_src_8888_0565_process_pixblock_head, \
459 pixman_composite_src_8888_0565_process_pixblock_tail, \
460 pixman_composite_src_8888_0565_process_pixblock_tail_head
462 /******************************************************************************/
/* SRC operation: r5g6b5 -> a8r8g8b8 conversion. The vshrn narrowing
 * shifts extract the color fields from the packed 16-bit pixels in q0;
 * the replication/alpha-fill steps are elided from this excerpt. */
464 .macro pixman_composite_src_0565_8888_process_pixblock_head
465 vshrn.u16 d30, q0, #8 /* extract red field */
466 vshrn.u16 d29, q0, #3 /* extract green field */
471 vshrn.u16 d28, q0, #2 /* extract blue field */
474 .macro pixman_composite_src_0565_8888_process_pixblock_tail
477 /* TODO: expand macros and do better instructions scheduling */
478 .macro pixman_composite_src_0565_8888_process_pixblock_tail_head
479 pixman_composite_src_0565_8888_process_pixblock_tail
480 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! /* store interleaved 32bpp */
481 vld1.16 {d0, d1}, [SRC]! /* load next 8 r5g6b5 pixels */
482 pixman_composite_src_0565_8888_process_pixblock_head
/* Instantiate: r5g6b5 -> a8r8g8b8 copy. default_init/default_cleanup
 * args are elided from this excerpt. */
486 generate_composite_function \
487 pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
488 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
489 8, /* number of pixels, processed in a single block */ \
490 10, /* prefetch distance */ \
493 pixman_composite_src_0565_8888_process_pixblock_head, \
494 pixman_composite_src_0565_8888_process_pixblock_tail, \
495 pixman_composite_src_0565_8888_process_pixblock_tail_head
497 /******************************************************************************/
/* ADD operation on a8 buffers, 32 pixels per block. The saturating
 * vqadd instructions of 'head' are elided from this excerpt; the
 * tail_head below interleaves load/store with the prefetch stream. */
499 .macro pixman_composite_add_8_8_process_pixblock_head
504 .macro pixman_composite_add_8_8_process_pixblock_tail
507 .macro pixman_composite_add_8_8_process_pixblock_tail_head
508 vld1.8 {d0, d1, d2, d3}, [SRC]! /* next 32 src pixels */
509 PF add PF_X, PF_X, #32
511 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! /* next 32 dst pixels */
512 PF addne PF_X, PF_X, #32
513 PF subne PF_CTL, PF_CTL, #1
514 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! /* store previous result */
516 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
517 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
518 PF subge PF_X, PF_X, ORIG_W
519 PF subges PF_CTL, PF_CTL, #0x10
521 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
522 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
/* Instantiate: a8 ADD a8, 32 pixels/block. default_init/default_cleanup
 * args are elided from this excerpt. */
526 generate_composite_function \
527 pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
528 FLAG_DST_READWRITE, \
529 32, /* number of pixels, processed in a single block */ \
530 10, /* prefetch distance */ \
533 pixman_composite_add_8_8_process_pixblock_head, \
534 pixman_composite_add_8_8_process_pixblock_tail, \
535 pixman_composite_add_8_8_process_pixblock_tail_head
537 /******************************************************************************/
/* ADD on 32bpp buffers: reuses the add_8_8 head/tail (byte-wise saturating
 * add is format-agnostic); only the prefetch step changes (8 pixels of
 * 4 bytes instead of 32 pixels of 1 byte). */
539 .macro pixman_composite_add_8888_8888_process_pixblock_tail_head
540 vld1.8 {d0, d1, d2, d3}, [SRC]!
541 PF add PF_X, PF_X, #8
543 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
544 PF addne PF_X, PF_X, #8
545 PF subne PF_CTL, PF_CTL, #1
546 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
548 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
549 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
550 PF subge PF_X, PF_X, ORIG_W
551 PF subges PF_CTL, PF_CTL, #0x10
553 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
554 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
/* Instantiate the full-image function and the single-scanline variant
 * (the latter has no prefetch-distance argument). Elided argument lines
 * are the default_init/default_cleanup pairs. */
558 generate_composite_function \
559 pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
560 FLAG_DST_READWRITE, \
561 8, /* number of pixels, processed in a single block */ \
562 10, /* prefetch distance */ \
565 pixman_composite_add_8_8_process_pixblock_head, \
566 pixman_composite_add_8_8_process_pixblock_tail, \
567 pixman_composite_add_8888_8888_process_pixblock_tail_head
569 generate_composite_function_single_scanline \
570 pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
571 FLAG_DST_READWRITE, \
572 8, /* number of pixels, processed in a single block */ \
575 pixman_composite_add_8_8_process_pixblock_head, \
576 pixman_composite_add_8_8_process_pixblock_tail, \
577 pixman_composite_add_8888_8888_process_pixblock_tail_head
579 /******************************************************************************/
/* OUT_REVERSE: dst = dst * (1 - src.alpha). Also reused as the first
 * stage of the OVER operation below (OVER = src + OUT_REVERSE result). */
581 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
582 vmvn.8 d24, d3 /* get inverted alpha */
583 /* do alpha blending */
/* NOTE(review): the q8/q9 producers (vmull.u8 with d4/d5) are elided
 * from this excerpt */
586 vmull.u8 q10, d24, d6
587 vmull.u8 q11, d24, d7
590 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
/* rounding division by 255 of all four channel products */
591 vrshr.u16 q14, q8, #8
592 vrshr.u16 q15, q9, #8
593 vrshr.u16 q12, q10, #8
594 vrshr.u16 q13, q11, #8
595 vraddhn.u16 d28, q14, q8
596 vraddhn.u16 d29, q15, q9
597 vraddhn.u16 d30, q12, q10
598 vraddhn.u16 d31, q13, q11
/* pipelined tail+head with interleaved advanced prefetch stream */
601 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
602 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
603 vrshr.u16 q14, q8, #8
604 PF add PF_X, PF_X, #8
606 vrshr.u16 q15, q9, #8
607 vrshr.u16 q12, q10, #8
608 vrshr.u16 q13, q11, #8
609 PF addne PF_X, PF_X, #8
610 PF subne PF_CTL, PF_CTL, #1
611 vraddhn.u16 d28, q14, q8
612 vraddhn.u16 d29, q15, q9
614 vraddhn.u16 d30, q12, q10
615 vraddhn.u16 d31, q13, q11
616 vld4.8 {d0, d1, d2, d3}, [SRC]!
617 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
619 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
620 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
621 PF subge PF_X, PF_X, ORIG_W
623 PF subges PF_CTL, PF_CTL, #0x10
625 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
626 vmull.u8 q10, d22, d6
627 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
628 vmull.u8 q11, d22, d7
/* Instantiate single-scanline OUT_REVERSE; init/cleanup argument lines
 * are elided from this excerpt. */
631 generate_composite_function_single_scanline \
632 pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
633 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
634 8, /* number of pixels, processed in a single block */ \
637 pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
638 pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
639 pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
641 /******************************************************************************/
/* OVER for 32bpp: head is exactly OUT_REVERSE head; tail adds the
 * source pixels (saturating) on top of the attenuated destination. */
643 .macro pixman_composite_over_8888_8888_process_pixblock_head
644 pixman_composite_out_reverse_8888_8888_process_pixblock_head
647 .macro pixman_composite_over_8888_8888_process_pixblock_tail
648 pixman_composite_out_reverse_8888_8888_process_pixblock_tail
649 vqadd.u8 q14, q0, q14
650 vqadd.u8 q15, q1, q15
/* pipelined tail+head: same scheduling pattern as the out_reverse
 * tail_head above, plus the two vqadd instructions of the OVER tail */
653 .macro pixman_composite_over_8888_8888_process_pixblock_tail_head
654 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
655 vrshr.u16 q14, q8, #8
656 PF add PF_X, PF_X, #8
658 vrshr.u16 q15, q9, #8
659 vrshr.u16 q12, q10, #8
660 vrshr.u16 q13, q11, #8
661 PF addne PF_X, PF_X, #8
662 PF subne PF_CTL, PF_CTL, #1
663 vraddhn.u16 d28, q14, q8
664 vraddhn.u16 d29, q15, q9
666 vraddhn.u16 d30, q12, q10
667 vraddhn.u16 d31, q13, q11
668 vqadd.u8 q14, q0, q14
669 vqadd.u8 q15, q1, q15
670 vld4.8 {d0, d1, d2, d3}, [SRC]!
671 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
673 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
674 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
675 PF subge PF_X, PF_X, ORIG_W
677 PF subges PF_CTL, PF_CTL, #0x10
679 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
680 vmull.u8 q10, d22, d6
681 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
682 vmull.u8 q11, d22, d7
/* Instantiate OVER a8r8g8b8 -> a8r8g8b8, both the full-image function
 * and the single-scanline variant; default_init/default_cleanup argument
 * lines are elided from this excerpt. */
685 generate_composite_function \
686 pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
687 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
688 8, /* number of pixels, processed in a single block */ \
689 5, /* prefetch distance */ \
692 pixman_composite_over_8888_8888_process_pixblock_head, \
693 pixman_composite_over_8888_8888_process_pixblock_tail, \
694 pixman_composite_over_8888_8888_process_pixblock_tail_head
696 generate_composite_function_single_scanline \
697 pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
698 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
699 8, /* number of pixels, processed in a single block */ \
702 pixman_composite_over_8888_8888_process_pixblock_head, \
703 pixman_composite_over_8888_8888_process_pixblock_tail, \
704 pixman_composite_over_8888_8888_process_pixblock_tail_head
706 /******************************************************************************/
708 /* TODO: expand macros and do better instructions scheduling */
/* Solid color OVER 32bpp dst: source constant stays in d0-d3 across
 * the whole run, so only the destination is reloaded each block */
709 .macro pixman_composite_over_n_8888_process_pixblock_tail_head
710 pixman_composite_over_8888_8888_process_pixblock_tail
711 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
712 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
713 pixman_composite_over_8888_8888_process_pixblock_head
/* init: load the solid a8r8g8b8 color from the stack into d3[0];
 * the channel replication steps are elided from this excerpt */
717 .macro pixman_composite_over_n_8888_init
718 add DUMMY, sp, #ARGS_STACK_OFFSET
719 vld1.32 {d3[0]}, [DUMMY]
/* Instantiate: solid OVER a8r8g8b8 (src bpp 0 = solid); cleanup-macro
 * argument line is elided from this excerpt. */
726 generate_composite_function \
727 pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
728 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
729 8, /* number of pixels, processed in a single block */ \
730 5, /* prefetch distance */ \
731 pixman_composite_over_n_8888_init, \
733 pixman_composite_over_8888_8888_process_pixblock_head, \
734 pixman_composite_over_8888_8888_process_pixblock_tail, \
735 pixman_composite_over_n_8888_process_pixblock_tail_head
737 /******************************************************************************/
/* OVER_REVERSE with solid source: destination acts as the 'source' of
 * the blend (note DST_R is loaded into d0-d3), the solid color is kept
 * in d4-d7 by the init macro below. Only a destination prefetch stream
 * is needed since there is no source image. */
739 .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
740 vrshr.u16 q14, q8, #8
741 PF add PF_X, PF_X, #8
743 vrshr.u16 q15, q9, #8
744 vrshr.u16 q12, q10, #8
745 vrshr.u16 q13, q11, #8
746 PF addne PF_X, PF_X, #8
747 PF subne PF_CTL, PF_CTL, #1
748 vraddhn.u16 d28, q14, q8
749 vraddhn.u16 d29, q15, q9
751 vraddhn.u16 d30, q12, q10
752 vraddhn.u16 d31, q13, q11
753 vqadd.u8 q14, q0, q14
754 vqadd.u8 q15, q1, q15
755 vld4.8 {d0, d1, d2, d3}, [DST_R, :128]!
757 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
758 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
759 PF subge PF_X, PF_X, ORIG_W
761 PF subges PF_CTL, PF_CTL, #0x10
763 vmull.u8 q10, d22, d6
764 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
765 vmull.u8 q11, d22, d7
/* init: load the solid color into d7[0]; the replication into d4-d7
 * is elided from this excerpt */
768 .macro pixman_composite_over_reverse_n_8888_init
769 add DUMMY, sp, #ARGS_STACK_OFFSET
770 vld1.32 {d7[0]}, [DUMMY]
/* Instantiate: solid OVER_REVERSE a8r8g8b8. Note the swapped base
 * registers (dst_r_basereg 0, src_basereg 4) matching the register use
 * in the tail_head macro above. Cleanup-macro argument line is elided. */
777 generate_composite_function \
778 pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
779 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
780 8, /* number of pixels, processed in a single block */ \
781 5, /* prefetch distance */ \
782 pixman_composite_over_reverse_n_8888_init, \
784 pixman_composite_over_8888_8888_process_pixblock_head, \
785 pixman_composite_over_8888_8888_process_pixblock_tail, \
786 pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
787 28, /* dst_w_basereg */ \
788 0, /* dst_r_basereg */ \
789 4, /* src_basereg */ \
790 24 /* mask_basereg */
792 /******************************************************************************/
/* Solid color with a8 mask OVER r5g6b5. First stage multiplies the solid
 * color (held in d8-d11 by the init macro) with the mask in d24, second
 * stage is the usual r5g6b5 blend. This is the function referenced as an
 * extensively-commented example in the file header. */
794 .macro pixman_composite_over_n_8_0565_process_pixblock_head
/* in_n_8: multiply each solid color channel by the mask; the q0/q1
 * producers (vmull.u8 with d8/d9) are elided from this excerpt */
798 vmull.u8 q6, d24, d10
799 vmull.u8 q7, d24, d11
800 vrshr.u16 q10, q0, #8
801 vrshr.u16 q11, q1, #8
802 vrshr.u16 q12, q6, #8
803 vrshr.u16 q13, q7, #8
/* rounding division by 255: masked src channels land in d0-d3 */
804 vraddhn.u16 d0, q0, q10
805 vraddhn.u16 d1, q1, q11
806 vraddhn.u16 d2, q6, q12
807 vraddhn.u16 d3, q7, q13
/* r5g6b5 dst unpacking (elided lines) followed by the standard blend */
815 vshrn.u16 d30, q2, #2
816 /* now do alpha blending */
819 vmull.u8 q12, d3, d30
820 vrshr.u16 q13, q10, #8
821 vrshr.u16 q3, q11, #8
822 vrshr.u16 q15, q12, #8
823 vraddhn.u16 d20, q10, q13
824 vraddhn.u16 d23, q11, q3
825 vraddhn.u16 d22, q12, q15
828 .macro pixman_composite_over_n_8_0565_process_pixblock_tail
829 vqadd.u8 d16, d2, d20
831 /* convert to r5g6b5 */
832 vshll.u8 q14, d16, #8
836 vsri.u16 q14, q9, #11
839 /* TODO: expand macros and do better instructions scheduling */
840 .macro pixman_composite_over_n_8_0565_process_pixblock_tail_head
841 pixman_composite_over_n_8_0565_process_pixblock_tail
842 vst1.16 {d28, d29}, [DST_W, :128]!
843 vld1.16 {d4, d5}, [DST_R, :128]!
844 vld1.8 {d24}, [MASK]! /* load next 8 mask values */
846 pixman_composite_over_n_8_0565_process_pixblock_head
850 * This function needs a special initialization of solid mask.
851 * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
852 * offset, split into color components and replicated in d8-d11
853 * registers. Additionally, this function needs all the NEON registers,
854 * so it has to save d8-d15 registers which are callee saved according
855 * to ABI. These registers are restored from 'cleanup' macro. All the
856 * other NEON registers are caller saved, so can be clobbered freely
857 * without introducing any problems.
859 .macro pixman_composite_over_n_8_0565_init
860 add DUMMY, sp, #ARGS_STACK_OFFSET
/* the vpush of d8-d15 and the vdup replication steps (original lines
 * 861, 863-867) are elided from this excerpt */
862 vld1.32 {d11[0]}, [DUMMY]
/* cleanup: restores d8-d15 (vpop elided from this excerpt) */
869 .macro pixman_composite_over_n_8_0565_cleanup
873 generate_composite_function \
874 pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
875 FLAG_DST_READWRITE, \
876 8, /* number of pixels, processed in a single block */ \
877 5, /* prefetch distance */ \
878 pixman_composite_over_n_8_0565_init, \
879 pixman_composite_over_n_8_0565_cleanup, \
880 pixman_composite_over_n_8_0565_process_pixblock_head, \
881 pixman_composite_over_n_8_0565_process_pixblock_tail, \
882 pixman_composite_over_n_8_0565_process_pixblock_tail_head
884 /******************************************************************************/
886 /* TODO: expand macros and do better instructions scheduling */
/* a8r8g8b8 src with a8 mask OVER r5g6b5: reuses the over_n_8_0565
 * head/tail, but reloads the source image into d8-d11 each block
 * instead of keeping a solid color there */
887 .macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
888 vld1.16 {d4, d5}, [DST_R, :128]!
889 pixman_composite_over_n_8_0565_process_pixblock_tail
890 vld4.8 {d8, d9, d10, d11}, [SRC]!
892 vld1.8 {d24}, [MASK]!
893 pixman_composite_over_n_8_0565_process_pixblock_head
894 vst1.16 {d28, d29}, [DST_W, :128]!
/* d8-d15 are callee-saved, hence the *_need_all_regs init/cleanup */
897 generate_composite_function \
898 pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
899 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
900 8, /* number of pixels, processed in a single block */ \
901 5, /* prefetch distance */ \
902 default_init_need_all_regs, \
903 default_cleanup_need_all_regs, \
904 pixman_composite_over_n_8_0565_process_pixblock_head, \
905 pixman_composite_over_n_8_0565_process_pixblock_tail, \
906 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
907 28, /* dst_w_basereg */ \
908 4, /* dst_r_basereg */ \
909 8, /* src_basereg */ \
910 24 /* mask_basereg */
912 /******************************************************************************/
/* Plain 16bpp copy: head/tail do nothing (bodies are empty apart from
 * elided lines), tail_head just streams 16 pixels per iteration. */
914 .macro pixman_composite_src_0565_0565_process_pixblock_head
917 .macro pixman_composite_src_0565_0565_process_pixblock_tail
920 .macro pixman_composite_src_0565_0565_process_pixblock_tail_head
921 vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
922 vld1.16 {d0, d1, d2, d3}, [SRC]!
/* NOTE(review): init/cleanup args and the final mask_basereg argument
 * of this invocation are elided from this excerpt */
926 generate_composite_function \
927 pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
928 FLAG_DST_WRITEONLY, \
929 16, /* number of pixels, processed in a single block */ \
930 10, /* prefetch distance */ \
933 pixman_composite_src_0565_0565_process_pixblock_head, \
934 pixman_composite_src_0565_0565_process_pixblock_tail, \
935 pixman_composite_src_0565_0565_process_pixblock_tail_head, \
936 0, /* dst_w_basereg */ \
937 0, /* dst_r_basereg */ \
938 0, /* src_basereg */ \
941 /******************************************************************************/
/* Solid fill of an a8 buffer: the color byte is replicated across
 * d0-d3 by 'init' (replication steps elided), the loop only stores. */
943 .macro pixman_composite_src_n_8_process_pixblock_head
946 .macro pixman_composite_src_n_8_process_pixblock_tail
949 .macro pixman_composite_src_n_8_process_pixblock_tail_head
950 vst1.8 {d0, d1, d2, d3}, [DST_W, :128]!
953 .macro pixman_composite_src_n_8_init
954 add DUMMY, sp, #ARGS_STACK_OFFSET
955 vld1.32 {d0[0]}, [DUMMY]
963 .macro pixman_composite_src_n_8_cleanup
/* prefetch distance 0 disables prefetch -- nothing is read in the loop */
966 generate_composite_function \
967 pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
968 FLAG_DST_WRITEONLY, \
969 32, /* number of pixels, processed in a single block */ \
970 0, /* prefetch distance */ \
971 pixman_composite_src_n_8_init, \
972 pixman_composite_src_n_8_cleanup, \
973 pixman_composite_src_n_8_process_pixblock_head, \
974 pixman_composite_src_n_8_process_pixblock_tail, \
975 pixman_composite_src_n_8_process_pixblock_tail_head, \
976 0, /* dst_w_basereg */ \
977 0, /* dst_r_basereg */ \
978 0, /* src_basereg */ \
981 /******************************************************************************/
/* Solid fill of an r5g6b5 buffer: the 16-bit color is replicated across
 * d0-d3 by 'init' (conversion/replication steps elided), loop only stores. */
983 .macro pixman_composite_src_n_0565_process_pixblock_head
986 .macro pixman_composite_src_n_0565_process_pixblock_tail
989 .macro pixman_composite_src_n_0565_process_pixblock_tail_head
990 vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
993 .macro pixman_composite_src_n_0565_init
994 add DUMMY, sp, #ARGS_STACK_OFFSET
995 vld1.32 {d0[0]}, [DUMMY]
1002 .macro pixman_composite_src_n_0565_cleanup
/* prefetch distance 0 disables prefetch -- nothing is read in the loop */
1005 generate_composite_function \
1006 pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
1007 FLAG_DST_WRITEONLY, \
1008 16, /* number of pixels, processed in a single block */ \
1009 0, /* prefetch distance */ \
1010 pixman_composite_src_n_0565_init, \
1011 pixman_composite_src_n_0565_cleanup, \
1012 pixman_composite_src_n_0565_process_pixblock_head, \
1013 pixman_composite_src_n_0565_process_pixblock_tail, \
1014 pixman_composite_src_n_0565_process_pixblock_tail_head, \
1015 0, /* dst_w_basereg */ \
1016 0, /* dst_r_basereg */ \
1017 0, /* src_basereg */ \
1018 0 /* mask_basereg */
1020 /******************************************************************************/
/*
 * SRC operator, solid color -> a8r8g8b8: constant fill of a 32bpp
 * destination.  head/tail are empty; tail_head stores 32 bytes
 * (8 pixels) of the replicated color per iteration.
 */
1022 .macro pixman_composite_src_n_8888_process_pixblock_head
1025 .macro pixman_composite_src_n_8888_process_pixblock_tail
1028 .macro pixman_composite_src_n_8888_process_pixblock_tail_head
1029     vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
/*
 * init: load the 32-bit color into lane 0 of d0; vsli then copies it
 * into the upper 32 bits of d0 as well (shift-left-insert by 32).
 * NOTE(review): replication into d1-d3 is done by instructions elided
 * from this excerpt -- confirm against full source.
 */
1032 .macro pixman_composite_src_n_8888_init
1033     add DUMMY, sp, #ARGS_STACK_OFFSET
1034     vld1.32 {d0[0]}, [DUMMY]
1035     vsli.u64 d0, d0, #32
1040 .macro pixman_composite_src_n_8888_cleanup
/* solid src, no mask, 32bpp dst */
1043 generate_composite_function \
1044     pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
1045     FLAG_DST_WRITEONLY, \
1046     8, /* number of pixels, processed in a single block */ \
1047     0, /* prefetch distance */ \
1048     pixman_composite_src_n_8888_init, \
1049     pixman_composite_src_n_8888_cleanup, \
1050     pixman_composite_src_n_8888_process_pixblock_head, \
1051     pixman_composite_src_n_8888_process_pixblock_tail, \
1052     pixman_composite_src_n_8888_process_pixblock_tail_head, \
1053     0, /* dst_w_basereg */ \
1054     0, /* dst_r_basereg */ \
1055     0, /* src_basereg */ \
1056     0 /* mask_basereg */
1058 /******************************************************************************/
/*
 * SRC operator, 8888 -> 8888: straight 32bpp copy.  head/tail are
 * empty; tail_head stores the previous 8-pixel (32-byte) block and
 * loads the next one from SRC.
 */
1060 .macro pixman_composite_src_8888_8888_process_pixblock_head
1063 .macro pixman_composite_src_8888_8888_process_pixblock_tail
1066 .macro pixman_composite_src_8888_8888_process_pixblock_tail_head
1067     vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1068     vld1.32 {d0, d1, d2, d3}, [SRC]!
/* 32bpp src, no mask, 32bpp dst; write-only destination */
1072 generate_composite_function \
1073     pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
1074     FLAG_DST_WRITEONLY, \
1075     8, /* number of pixels, processed in a single block */ \
1076     10, /* prefetch distance */ \
1079     pixman_composite_src_8888_8888_process_pixblock_head, \
1080     pixman_composite_src_8888_8888_process_pixblock_tail, \
1081     pixman_composite_src_8888_8888_process_pixblock_tail_head, \
1082     0, /* dst_w_basereg */ \
1083     0, /* dst_r_basereg */ \
1084     0, /* src_basereg */ \
1085     0 /* mask_basereg */
1087 /******************************************************************************/
/*
 * SRC operator, x8r8g8b8 -> a8r8g8b8: 32bpp copy that forces the
 * (undefined) alpha byte of every pixel to 0xff.
 * NOTE(review): the head macro body (presumably ORing the q2 alpha
 * mask into the loaded pixels) is elided from this excerpt -- confirm
 * against full source.
 */
1089 .macro pixman_composite_src_x888_8888_process_pixblock_head
1094 .macro pixman_composite_src_x888_8888_process_pixblock_tail
1097 .macro pixman_composite_src_x888_8888_process_pixblock_tail_head
1098     vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1099     vld1.32 {d0, d1, d2, d3}, [SRC]!
/*
 * init: shift positions a constant into the alpha byte of each 32-bit
 * lane of q2, building the alpha mask used by the head macro.
 * NOTE(review): the instruction that fills q2 before this shift is
 * elided from this excerpt.
 */
1105 .macro pixman_composite_src_x888_8888_init
1107     vshl.u32 q2, q2, #24
1110 generate_composite_function \
1111     pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
1112     FLAG_DST_WRITEONLY, \
1113     8, /* number of pixels, processed in a single block */ \
1114     10, /* prefetch distance */ \
1115     pixman_composite_src_x888_8888_init, \
1117     pixman_composite_src_x888_8888_process_pixblock_head, \
1118     pixman_composite_src_x888_8888_process_pixblock_tail, \
1119     pixman_composite_src_x888_8888_process_pixblock_tail_head, \
1120     0, /* dst_w_basereg */ \
1121     0, /* dst_r_basereg */ \
1122     0, /* src_basereg */ \
1123     0 /* mask_basereg */
1125 /******************************************************************************/
/*
 * OVER operator, solid source with a8 mask over a8r8g8b8 destination:
 *   dst = (src IN mask) + dst * (1 - alpha(src IN mask))
 * head: multiply each solid-source channel by the mask byte, then
 * round/narrow via the vrshr + vraddhn pair (NEON idiom for a
 * correctly rounded x*y/255), derive the inverted alpha and start the
 * dst * (1 - alpha) multiplies.
 */
1127 .macro pixman_composite_over_n_8_8888_process_pixblock_head
1128     /* expecting deinterleaved source data in {d8, d9, d10, d11} */
1129     /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
1130     /* and destination data in {d4, d5, d6, d7} */
1131     /* mask is in d24 (d25, d26, d27 are unused) */
1134     vmull.u8 q0, d24, d8
1135     vmull.u8 q1, d24, d9
1136     vmull.u8 q6, d24, d10
1137     vmull.u8 q7, d24, d11
1138     vrshr.u16 q10, q0, #8
1139     vrshr.u16 q11, q1, #8
1140     vrshr.u16 q12, q6, #8
1141     vrshr.u16 q13, q7, #8
1142     vraddhn.u16 d0, q0, q10
1143     vraddhn.u16 d1, q1, q11
1144     vraddhn.u16 d2, q6, q12
1145     vraddhn.u16 d3, q7, q13
1146     vmvn.8 d24, d3 /* get inverted alpha */
1147     /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */
1148     /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
1149     /* now do alpha blending */
1150     vmull.u8 q8, d24, d4
1151     vmull.u8 q9, d24, d5
1152     vmull.u8 q10, d24, d6
1153     vmull.u8 q11, d24, d7
/* tail: finish the dst*(1-alpha) rounding and saturate-add the source */
1156 .macro pixman_composite_over_n_8_8888_process_pixblock_tail
1157     vrshr.u16 q14, q8, #8
1158     vrshr.u16 q15, q9, #8
1159     vrshr.u16 q12, q10, #8
1160     vrshr.u16 q13, q11, #8
1161     vraddhn.u16 d28, q14, q8
1162     vraddhn.u16 d29, q15, q9
1163     vraddhn.u16 d30, q12, q10
1164     vraddhn.u16 d31, q13, q11
1165     vqadd.u8 q14, q0, q14
1166     vqadd.u8 q15, q1, q15
1169 /* TODO: expand macros and do better instructions scheduling */
1170 .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
1171     pixman_composite_over_n_8_8888_process_pixblock_tail
1172     vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1173     vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1174     vld1.8 {d24}, [MASK]!
1176     pixman_composite_over_n_8_8888_process_pixblock_head
/*
 * init: load the 32-bit solid source color into lane 0 of d11.
 * NOTE(review): the replication of the color channels into d8-d11 is
 * done by instructions elided from this excerpt -- confirm against
 * full source.
 */
1179 .macro pixman_composite_over_n_8_8888_init
1180     add DUMMY, sp, #ARGS_STACK_OFFSET
1182     vld1.32 {d11[0]}, [DUMMY]
1189 .macro pixman_composite_over_n_8_8888_cleanup
/* solid src, 8bpp mask, 32bpp dst; destination is read-modify-write */
1193 generate_composite_function \
1194     pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
1195     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1196     8, /* number of pixels, processed in a single block */ \
1197     5, /* prefetch distance */ \
1198     pixman_composite_over_n_8_8888_init, \
1199     pixman_composite_over_n_8_8888_cleanup, \
1200     pixman_composite_over_n_8_8888_process_pixblock_head, \
1201     pixman_composite_over_n_8_8888_process_pixblock_tail, \
1202     pixman_composite_over_n_8_8888_process_pixblock_tail_head
1204 /******************************************************************************/
/*
 * OVER operator with component alpha, solid source, a8r8g8b8 mask,
 * a8r8g8b8 destination (per-channel):
 *   dst = src * mask + dst * (1 - alpha(src) * mask)
 */
1206 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1208      * 'combine_mask_ca' replacement
1210      * input:  solid src (n) in {d8,  d9,  d10, d11}
1211      *         dest in          {d4,  d5,  d6,  d7 }
1212      *         mask in          {d24, d25, d26, d27}
1213      * output: updated src in   {d0,  d1,  d2,  d3 }
1214      *         updated mask in  {d24, d25, d26, d3 }
/* per-channel src*mask, plus src_alpha*mask for the updated mask */
1216     vmull.u8 q0, d24, d8
1217     vmull.u8 q1, d25, d9
1218     vmull.u8 q6, d26, d10
1219     vmull.u8 q7, d27, d11
1220     vmull.u8 q9, d11, d25
1221     vmull.u8 q12, d11, d24
1222     vmull.u8 q13, d11, d26
/* rounded narrowing of all products back to 8 bits (x/255 idiom) */
1223     vrshr.u16 q8, q0, #8
1224     vrshr.u16 q10, q1, #8
1225     vrshr.u16 q11, q6, #8
1226     vraddhn.u16 d0, q0, q8
1227     vraddhn.u16 d1, q1, q10
1228     vraddhn.u16 d2, q6, q11
1229     vrshr.u16 q11, q12, #8
1230     vrshr.u16 q8, q9, #8
1231     vrshr.u16 q6, q13, #8
1232     vrshr.u16 q10, q7, #8
1233     vraddhn.u16 d24, q12, q11
1234     vraddhn.u16 d25, q9, q8
1235     vraddhn.u16 d26, q13, q6
1236     vraddhn.u16 d3, q7, q10
1238      * 'combine_over_ca' replacement
1240      * output: updated dest in {d28, d29, d30, d31}
/* start dst * (1 - alpha_mask); NOTE(review): the inversion of
   d24-d27 is presumably in instructions elided from this excerpt --
   confirm against full source */
1244     vmull.u8 q8, d24, d4
1245     vmull.u8 q9, d25, d5
1248     vmull.u8 q10, d26, d6
1249     vmull.u8 q11, d27, d7
1252 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
1253     /* ... continue 'combine_over_ca' replacement */
1254     vrshr.u16 q14, q8, #8
1255     vrshr.u16 q15, q9, #8
1256     vrshr.u16 q6, q10, #8
1257     vrshr.u16 q7, q11, #8
1258     vraddhn.u16 d28, q14, q8
1259     vraddhn.u16 d29, q15, q9
1260     vraddhn.u16 d30, q6, q10
1261     vraddhn.u16 d31, q7, q11
1262     vqadd.u8 q14, q0, q14
1263     vqadd.u8 q15, q1, q15
/* pipelined tail_head: interleaves the tail of the previous block
   with the mask/dest loads and head of the next one */
1266 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1267     vrshr.u16 q14, q8, #8
1268     vrshr.u16 q15, q9, #8
1269     vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1270     vrshr.u16 q6, q10, #8
1271     vrshr.u16 q7, q11, #8
1272     vraddhn.u16 d28, q14, q8
1273     vraddhn.u16 d29, q15, q9
1274     vraddhn.u16 d30, q6, q10
1275     vraddhn.u16 d31, q7, q11
1276     vld4.8 {d24, d25, d26, d27}, [MASK]!
1277     vqadd.u8 q14, q0, q14
1278     vqadd.u8 q15, q1, q15
1280     pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1281     vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/*
 * init: load the 32-bit solid source color into lane 0 of d11.
 * NOTE(review): channel replication into d8-d11 is elided here.
 */
1284 .macro pixman_composite_over_n_8888_8888_ca_init
1285     add DUMMY, sp, #ARGS_STACK_OFFSET
1287     vld1.32 {d11[0]}, [DUMMY]
1294 .macro pixman_composite_over_n_8888_8888_ca_cleanup
/* solid src, 32bpp mask, 32bpp dst; read-modify-write destination */
1298 generate_composite_function \
1299     pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
1300     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1301     8, /* number of pixels, processed in a single block */ \
1302     5, /* prefetch distance */ \
1303     pixman_composite_over_n_8888_8888_ca_init, \
1304     pixman_composite_over_n_8888_8888_ca_cleanup, \
1305     pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
1306     pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
1307     pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1309 /******************************************************************************/
/*
 * ADD operator, solid source, a8 mask, a8 destination:
 *   dst = saturate(dst + mask * alpha(src))
 * NOTE(review): the comment below mentions source data in {d8-d11},
 * but only d11 (the solid source alpha) is used by the multiplies.
 */
1311 .macro pixman_composite_add_n_8_8_process_pixblock_head
1312     /* expecting source data in {d8, d9, d10, d11} */
1313     /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
1314     /* and destination data in {d4, d5, d6, d7} */
1315     /* mask is in d24, d25, d26, d27 */
1316     vmull.u8 q0, d24, d11
1317     vmull.u8 q1, d25, d11
1318     vmull.u8 q6, d26, d11
1319     vmull.u8 q7, d27, d11
/* rounded narrowing (x/255 idiom), then saturating add onto dest */
1320     vrshr.u16 q10, q0, #8
1321     vrshr.u16 q11, q1, #8
1322     vrshr.u16 q12, q6, #8
1323     vrshr.u16 q13, q7, #8
1324     vraddhn.u16 d0, q0, q10
1325     vraddhn.u16 d1, q1, q11
1326     vraddhn.u16 d2, q6, q12
1327     vraddhn.u16 d3, q7, q13
1328     vqadd.u8 q14, q0, q2
1329     vqadd.u8 q15, q1, q3
1332 .macro pixman_composite_add_n_8_8_process_pixblock_tail
1335 /* TODO: expand macros and do better instructions scheduling */
1336 .macro pixman_composite_add_n_8_8_process_pixblock_tail_head
1337     pixman_composite_add_n_8_8_process_pixblock_tail
1338     vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1339     vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1340     vld1.8 {d24, d25, d26, d27}, [MASK]!
1341     cache_preload 32, 32
1342     pixman_composite_add_n_8_8_process_pixblock_head
/*
 * init: load the solid source color into lane 0 of d11.
 * NOTE(review): extraction/replication of the alpha byte is elided
 * from this excerpt.
 */
1345 .macro pixman_composite_add_n_8_8_init
1346     add DUMMY, sp, #ARGS_STACK_OFFSET
1348     vld1.32 {d11[0]}, [DUMMY]
1352 .macro pixman_composite_add_n_8_8_cleanup
/* solid src, 8bpp mask, 8bpp dst; 32 pixels per block */
1356 generate_composite_function \
1357     pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
1358     FLAG_DST_READWRITE, \
1359     32, /* number of pixels, processed in a single block */ \
1360     5, /* prefetch distance */ \
1361     pixman_composite_add_n_8_8_init, \
1362     pixman_composite_add_n_8_8_cleanup, \
1363     pixman_composite_add_n_8_8_process_pixblock_head, \
1364     pixman_composite_add_n_8_8_process_pixblock_tail, \
1365     pixman_composite_add_n_8_8_process_pixblock_tail_head
1367 /******************************************************************************/
/*
 * ADD operator, a8 source, a8 mask, a8 destination (per byte):
 *   dst = saturate(dst + src * mask)
 */
1369 .macro pixman_composite_add_8_8_8_process_pixblock_head
1370     /* expecting source data in {d0, d1, d2, d3} */
1371     /* destination data in {d4, d5, d6, d7} */
1372     /* mask in {d24, d25, d26, d27} */
1373     vmull.u8 q8, d24, d0
1374     vmull.u8 q9, d25, d1
1375     vmull.u8 q10, d26, d2
1376     vmull.u8 q11, d27, d3
/* rounded narrowing of src*mask (x/255 idiom), then saturating add */
1377     vrshr.u16 q0, q8, #8
1378     vrshr.u16 q1, q9, #8
1379     vrshr.u16 q12, q10, #8
1380     vrshr.u16 q13, q11, #8
1381     vraddhn.u16 d0, q0, q8
1382     vraddhn.u16 d1, q1, q9
1383     vraddhn.u16 d2, q12, q10
1384     vraddhn.u16 d3, q13, q11
1385     vqadd.u8 q14, q0, q2
1386     vqadd.u8 q15, q1, q3
1389 .macro pixman_composite_add_8_8_8_process_pixblock_tail
1392 /* TODO: expand macros and do better instructions scheduling */
1393 .macro pixman_composite_add_8_8_8_process_pixblock_tail_head
1394     pixman_composite_add_8_8_8_process_pixblock_tail
1395     vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1396     vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1397     vld1.8 {d24, d25, d26, d27}, [MASK]!
1398     vld1.8 {d0, d1, d2, d3}, [SRC]!
1399     cache_preload 32, 32
1400     pixman_composite_add_8_8_8_process_pixblock_head
1403 .macro pixman_composite_add_8_8_8_init
1406 .macro pixman_composite_add_8_8_8_cleanup
/* 8bpp src, 8bpp mask, 8bpp dst; 32 pixels per block */
1409 generate_composite_function \
1410     pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
1411     FLAG_DST_READWRITE, \
1412     32, /* number of pixels, processed in a single block */ \
1413     5, /* prefetch distance */ \
1414     pixman_composite_add_8_8_8_init, \
1415     pixman_composite_add_8_8_8_cleanup, \
1416     pixman_composite_add_8_8_8_process_pixblock_head, \
1417     pixman_composite_add_8_8_8_process_pixblock_tail, \
1418     pixman_composite_add_8_8_8_process_pixblock_tail_head
1420 /******************************************************************************/
/*
 * ADD operator, 8888 source, 8888 mask, 8888 destination:
 *   dst = saturate(dst + src * alpha(mask))
 * Note: every source channel is multiplied by d27, the mask's alpha
 * plane -- this is unified (not component) alpha.
 */
1422 .macro pixman_composite_add_8888_8888_8888_process_pixblock_head
1423     /* expecting source data in {d0, d1, d2, d3} */
1424     /* destination data in {d4, d5, d6, d7} */
1425     /* mask in {d24, d25, d26, d27} */
1426     vmull.u8 q8, d27, d0
1427     vmull.u8 q9, d27, d1
1428     vmull.u8 q10, d27, d2
1429     vmull.u8 q11, d27, d3
/* rounded narrowing (x/255 idiom), then saturating add onto dest */
1430     vrshr.u16 q0, q8, #8
1431     vrshr.u16 q1, q9, #8
1432     vrshr.u16 q12, q10, #8
1433     vrshr.u16 q13, q11, #8
1434     vraddhn.u16 d0, q0, q8
1435     vraddhn.u16 d1, q1, q9
1436     vraddhn.u16 d2, q12, q10
1437     vraddhn.u16 d3, q13, q11
1438     vqadd.u8 q14, q0, q2
1439     vqadd.u8 q15, q1, q3
1442 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
1445 /* TODO: expand macros and do better instructions scheduling */
1446 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
1447     pixman_composite_add_8888_8888_8888_process_pixblock_tail
1448     vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1449     vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1450     vld4.8 {d24, d25, d26, d27}, [MASK]!
1451     vld4.8 {d0, d1, d2, d3}, [SRC]!
1453     pixman_composite_add_8888_8888_8888_process_pixblock_head
/* full composite-function variant */
1456 generate_composite_function \
1457     pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
1458     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1459     8, /* number of pixels, processed in a single block */ \
1460     10, /* prefetch distance */ \
1463     pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1464     pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1465     pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
/* same pixblock code reused for the single-scanline variant */
1467 generate_composite_function_single_scanline \
1468     pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
1469     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1470     8, /* number of pixels, processed in a single block */ \
1473     pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1474     pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1475     pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
1477 /******************************************************************************/
/*
 * OUT_REVERSE building block, 8888 source, solid mask (alpha in d15),
 * 8888 destination:
 *   dst = dst * (1 - alpha(src * mask))
 * head: src *= mask alpha (rounded), invert the resulting alpha, and
 * start dst * ~alpha; tail: round/narrow the result into d28-d31.
 * Also reused by the over_8888_*_8888 family below.
 */
1479 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
1480     /* expecting source data in {d0, d1, d2, d3} */
1481     /* destination data in {d4, d5, d6, d7} */
1482     /* solid mask is in d15 */
1485     vmull.u8 q8, d15, d3
1486     vmull.u8 q6, d15, d2
1487     vmull.u8 q5, d15, d1
1488     vmull.u8 q4, d15, d0
/* rounded narrowing of src*mask (x/255 idiom) */
1489     vrshr.u16 q13, q8, #8
1490     vrshr.u16 q12, q6, #8
1491     vrshr.u16 q11, q5, #8
1492     vrshr.u16 q10, q4, #8
1493     vraddhn.u16 d3, q8, q13
1494     vraddhn.u16 d2, q6, q12
1495     vraddhn.u16 d1, q5, q11
1496     vraddhn.u16 d0, q4, q10
1497     vmvn.8 d24, d3 /* get inverted alpha */
1498     /* now do alpha blending */
1499     vmull.u8 q8, d24, d4
1500     vmull.u8 q9, d24, d5
1501     vmull.u8 q10, d24, d6
1502     vmull.u8 q11, d24, d7
1505 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
1506     vrshr.u16 q14, q8, #8
1507     vrshr.u16 q15, q9, #8
1508     vrshr.u16 q12, q10, #8
1509     vrshr.u16 q13, q11, #8
1510     vraddhn.u16 d28, q14, q8
1511     vraddhn.u16 d29, q15, q9
1512     vraddhn.u16 d30, q12, q10
1513     vraddhn.u16 d31, q13, q11
1516 /* TODO: expand macros and do better instructions scheduling */
1517 .macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
1518     vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1519     pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
1520     vld4.8 {d0, d1, d2, d3}, [SRC]!
1522     vld4.8 {d12, d13, d14, d15}, [MASK]!
1523     pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
1524     vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* scanline variant with a per-pixel 8888 mask (alpha lands in d15) */
1527 generate_composite_function_single_scanline \
1528     pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
1529     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1530     8, /* number of pixels, processed in a single block */ \
1531     default_init_need_all_regs, \
1532     default_cleanup_need_all_regs, \
1533     pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
1534     pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
1535     pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \
1536     28, /* dst_w_basereg */ \
1537     4, /* dst_r_basereg */ \
1538     0, /* src_basereg */ \
1539     12 /* mask_basereg */
1541 /******************************************************************************/
/*
 * OVER operator, 8888 source, solid mask, 8888 destination.  Reuses
 * the out_reverse head/tail and saturate-adds the masked source:
 *   dst = src*mask + dst * (1 - alpha(src*mask))
 */
1543 .macro pixman_composite_over_8888_n_8888_process_pixblock_head
1544     pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
1547 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail
1548     pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
1549     vqadd.u8 q14, q0, q14
1550     vqadd.u8 q15, q1, q15
1553 /* TODO: expand macros and do better instructions scheduling */
1554 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
1555     vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1556     pixman_composite_over_8888_n_8888_process_pixblock_tail
1557     vld4.8 {d0, d1, d2, d3}, [SRC]!
1559     pixman_composite_over_8888_n_8888_process_pixblock_head
1560     vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/*
 * init: load the solid mask into lane 0 of d15.
 * NOTE(review): the DUMMY address setup and alpha replication are
 * elided from this excerpt -- confirm against full source.
 */
1563 .macro pixman_composite_over_8888_n_8888_init
1566     vld1.32 {d15[0]}, [DUMMY]
1570 .macro pixman_composite_over_8888_n_8888_cleanup
/* 32bpp src, solid mask, 32bpp dst; read-modify-write destination */
1574 generate_composite_function \
1575     pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
1576     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1577     8, /* number of pixels, processed in a single block */ \
1578     5, /* prefetch distance */ \
1579     pixman_composite_over_8888_n_8888_init, \
1580     pixman_composite_over_8888_n_8888_cleanup, \
1581     pixman_composite_over_8888_n_8888_process_pixblock_head, \
1582     pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1583     pixman_composite_over_8888_n_8888_process_pixblock_tail_head
1585 /******************************************************************************/
/*
 * OVER operator, 8888 source, 8888 mask, 8888 destination.  Reuses
 * the over_8888_n_8888 head/tail; this tail_head additionally reloads
 * the per-pixel mask into d12-d15 every iteration (the mask alpha
 * plane lands in d15, which the shared head consumes).
 */
1587 /* TODO: expand macros and do better instructions scheduling */
1588 .macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
1589     vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1590     pixman_composite_over_8888_n_8888_process_pixblock_tail
1591     vld4.8 {d0, d1, d2, d3}, [SRC]!
1593     vld4.8 {d12, d13, d14, d15}, [MASK]!
1594     pixman_composite_over_8888_n_8888_process_pixblock_head
1595     vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* full composite-function variant */
1598 generate_composite_function \
1599     pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
1600     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1601     8, /* number of pixels, processed in a single block */ \
1602     5, /* prefetch distance */ \
1603     default_init_need_all_regs, \
1604     default_cleanup_need_all_regs, \
1605     pixman_composite_over_8888_n_8888_process_pixblock_head, \
1606     pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1607     pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
1608     28, /* dst_w_basereg */ \
1609     4, /* dst_r_basereg */ \
1610     0, /* src_basereg */ \
1611     12 /* mask_basereg */
/* single-scanline variant of the same operation */
1613 generate_composite_function_single_scanline \
1614     pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
1615     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1616     8, /* number of pixels, processed in a single block */ \
1617     default_init_need_all_regs, \
1618     default_cleanup_need_all_regs, \
1619     pixman_composite_over_8888_n_8888_process_pixblock_head, \
1620     pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1621     pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
1622     28, /* dst_w_basereg */ \
1623     4, /* dst_r_basereg */ \
1624     0, /* src_basereg */ \
1625     12 /* mask_basereg */
1627 /******************************************************************************/
/*
 * OVER operator, 8888 source, a8 mask, 8888 destination.  Same shared
 * head/tail as over_8888_n_8888, but the 8-bit mask is loaded
 * per-pixel into d15 (mask_basereg = 15) each iteration.
 */
1629 /* TODO: expand macros and do better instructions scheduling */
1630 .macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
1631     vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1632     pixman_composite_over_8888_n_8888_process_pixblock_tail
1633     vld4.8 {d0, d1, d2, d3}, [SRC]!
1635     vld1.8 {d15}, [MASK]!
1636     pixman_composite_over_8888_n_8888_process_pixblock_head
1637     vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* 32bpp src, 8bpp mask, 32bpp dst */
1640 generate_composite_function \
1641     pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
1642     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1643     8, /* number of pixels, processed in a single block */ \
1644     5, /* prefetch distance */ \
1645     default_init_need_all_regs, \
1646     default_cleanup_need_all_regs, \
1647     pixman_composite_over_8888_n_8888_process_pixblock_head, \
1648     pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1649     pixman_composite_over_8888_8_8888_process_pixblock_tail_head \
1650     28, /* dst_w_basereg */ \
1651     4, /* dst_r_basereg */ \
1652     0, /* src_basereg */ \
1653     15 /* mask_basereg */
1655 /******************************************************************************/
/*
 * SRC operator, 24bpp -> 24bpp: plain copy using 3-way interleaved
 * loads/stores (vld3/vst3, 3 bytes per pixel; d0-d2 hold 8 pixels =
 * 24 bytes per block).
 */
1657 .macro pixman_composite_src_0888_0888_process_pixblock_head
1660 .macro pixman_composite_src_0888_0888_process_pixblock_tail
1663 .macro pixman_composite_src_0888_0888_process_pixblock_tail_head
1664     vst3.8 {d0, d1, d2}, [DST_W]!
1665     vld3.8 {d0, d1, d2}, [SRC]!
/* 24bpp src, no mask, 24bpp dst; write-only destination */
1669 generate_composite_function \
1670     pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
1671     FLAG_DST_WRITEONLY, \
1672     8, /* number of pixels, processed in a single block */ \
1673     10, /* prefetch distance */ \
1676     pixman_composite_src_0888_0888_process_pixblock_head, \
1677     pixman_composite_src_0888_0888_process_pixblock_tail, \
1678     pixman_composite_src_0888_0888_process_pixblock_tail_head, \
1679     0, /* dst_w_basereg */ \
1680     0, /* dst_r_basereg */ \
1681     0, /* src_basereg */ \
1682     0 /* mask_basereg */
1684 /******************************************************************************/
/*
 * SRC operator, 24bpp -> 32bpp with reversed channel order: tail_head
 * stores a prepared 4-plane (d0-d3) block and loads the next 3-plane
 * source block.
 * NOTE(review): the channel-swap instructions (head body) and the
 * init code that presets the alpha plane are elided from this
 * excerpt -- confirm against full source.
 */
1686 .macro pixman_composite_src_0888_8888_rev_process_pixblock_head
1690 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
1693 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
1694     vst4.8 {d0, d1, d2, d3}, [DST_W]!
1695     vld3.8 {d0, d1, d2}, [SRC]!
1700 .macro pixman_composite_src_0888_8888_rev_init
/* 24bpp src, no mask, 32bpp dst; write-only destination */
1704 generate_composite_function \
1705     pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
1706     FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
1707     8, /* number of pixels, processed in a single block */ \
1708     10, /* prefetch distance */ \
1709     pixman_composite_src_0888_8888_rev_init, \
1711     pixman_composite_src_0888_8888_rev_process_pixblock_head, \
1712     pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
1713     pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
1714     0, /* dst_w_basereg */ \
1715     0, /* dst_r_basereg */ \
1716     0, /* src_basereg */ \
1717     0 /* mask_basereg */
1719 /******************************************************************************/
/*
 * SRC operator, 24bpp (reversed channel order) -> r5g6b5.
 * tail packs a block into 565: vshll widens one channel into the high
 * bits of each 16-bit lane, then vsri shift-right-inserts the other
 * two channels at bit offsets 5 and 11.
 * NOTE(review): the head macro body (the widening shifts that fill
 * q8/q9) is elided from this excerpt -- confirm against full source.
 */
1721 .macro pixman_composite_src_0888_0565_rev_process_pixblock_head
1726 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
1727     vshll.u8 q14, d0, #8
1728     vsri.u16 q14, q8, #5
1729     vsri.u16 q14, q9, #11
/* pipelined variant: interleave packing with the next source load */
1732 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
1733     vshll.u8 q14, d0, #8
1734     vld3.8 {d0, d1, d2}, [SRC]!
1735     vsri.u16 q14, q8, #5
1736     vsri.u16 q14, q9, #11
1738     vst1.16 {d28, d29}, [DST_W, :128]!
/* 24bpp src, no mask, 16bpp dst; packed result produced in d28/d29 */
1742 generate_composite_function \
1743     pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
1744     FLAG_DST_WRITEONLY, \
1745     8, /* number of pixels, processed in a single block */ \
1746     10, /* prefetch distance */ \
1749     pixman_composite_src_0888_0565_rev_process_pixblock_head, \
1750     pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
1751     pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
1752     28, /* dst_w_basereg */ \
1753     0, /* dst_r_basereg */ \
1754     0, /* src_basereg */ \
1755     0 /* mask_basereg */
1757 /******************************************************************************/
/*
 * SRC operator, pixbuf (non-premultiplied) -> a8r8g8b8: multiplies
 * color channels by the alpha plane (d3) with correct rounding
 * (vrshr + vraddhn idiom) to produce a premultiplied result in
 * d28-d31.
 * NOTE(review): only one of the channel multiplies (q10 = d3*d2) is
 * visible; the q8/q9 multiplies and any R/B swap handling are elided
 * from this excerpt -- confirm against full source.
 */
1759 .macro pixman_composite_src_pixbuf_8888_process_pixblock_head
1762     vmull.u8 q10, d3, d2
1765 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
1766     vrshr.u16 q11, q8, #8
1768     vrshr.u16 q12, q9, #8
1769     vrshr.u16 q13, q10, #8
1770     vraddhn.u16 d30, q11, q8
1771     vraddhn.u16 d29, q12, q9
1772     vraddhn.u16 d28, q13, q10
/*
 * Pipelined tail_head with an inline software prefetcher: the
 * PF-prefixed (conditionally assembled) instructions advance PF_X
 * along the scanline and pld from the source, stepping to the next
 * line when the end of the current one is reached.
 */
1775 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
1776     vrshr.u16 q11, q8, #8
1778     vrshr.u16 q12, q9, #8
1779     vrshr.u16 q13, q10, #8
1780     vld4.8 {d0, d1, d2, d3}, [SRC]!
1781     vraddhn.u16 d30, q11, q8
1782     PF add PF_X, PF_X, #8
1784     PF addne PF_X, PF_X, #8
1785     PF subne PF_CTL, PF_CTL, #1
1786     vraddhn.u16 d29, q12, q9
1787     vraddhn.u16 d28, q13, q10
1790     vmull.u8 q10, d3, d2
1791     vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1793     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
1794     PF subge PF_X, PF_X, ORIG_W
1795     PF subges PF_CTL, PF_CTL, #0x10
1796     PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
/* 32bpp src, no mask, 32bpp dst; result block based at d28 */
1799 generate_composite_function \
1800     pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
1801     FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
1802     8, /* number of pixels, processed in a single block */ \
1803     10, /* prefetch distance */ \
1806     pixman_composite_src_pixbuf_8888_process_pixblock_head, \
1807     pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
1808     pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
1809     28, /* dst_w_basereg */ \
1810     0, /* dst_r_basereg */ \
1811     0, /* src_basereg */ \
1812     0 /* mask_basereg */
1814 /******************************************************************************/
/*
 * OVER operator, r5g6b5 source, a8 mask, r5g6b5 destination.
 * head: unpack src and dst from 565 to planar x888, multiply the
 * source channels by the mask (d15), and start the destination
 * multiplies.
 * NOTE(review): q13 = d7 * d6 suggests d7 holds the inverted mask;
 * the vmvn producing d7 (and the q8/q9 dest multiplies) are elided
 * from this excerpt -- confirm against full source.
 * tail: finish the blend (round/narrow, saturating add) and repack
 * the 32bpp result back to 565 in d28/d29.
 */
1816 .macro pixman_composite_over_0565_8_0565_process_pixblock_head
1817     /* mask is in d15 */
1818     convert_0565_to_x888 q4, d2, d1, d0
1819     convert_0565_to_x888 q5, d6, d5, d4
1820     /* source pixel data is in {d0, d1, d2, XX} */
1821     /* destination pixel data is in {d4, d5, d6, XX} */
1823     vmull.u8 q6, d15, d2
1824     vmull.u8 q5, d15, d1
1825     vmull.u8 q4, d15, d0
1828     vmull.u8 q13, d7, d6
1829     vrshr.u16 q12, q6, #8
1830     vrshr.u16 q11, q5, #8
1831     vrshr.u16 q10, q4, #8
1832     vraddhn.u16 d2, q6, q12
1833     vraddhn.u16 d1, q5, q11
1834     vraddhn.u16 d0, q4, q10
1837 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail
1838     vrshr.u16 q14, q8, #8
1839     vrshr.u16 q15, q9, #8
1840     vrshr.u16 q12, q13, #8
1841     vraddhn.u16 d28, q14, q8
1842     vraddhn.u16 d29, q15, q9
1843     vraddhn.u16 d30, q12, q13
1844     vqadd.u8 q0, q0, q14
1845     vqadd.u8 q1, q1, q15
1846     /* 32bpp result is in {d0, d1, d2, XX} */
1847     convert_8888_to_0565 d2, d1, d0, q14, q15, q3
1850 /* TODO: expand macros and do better instructions scheduling */
1851 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
1852     vld1.8 {d15}, [MASK]!
1853     pixman_composite_over_0565_8_0565_process_pixblock_tail
1854     vld1.16 {d8, d9}, [SRC]!
1855     vld1.16 {d10, d11}, [DST_R, :128]!
1857     pixman_composite_over_0565_8_0565_process_pixblock_head
1858     vst1.16 {d28, d29}, [DST_W, :128]!
/* 16bpp src, 8bpp mask, 16bpp dst; src block at d8, dst-read at d10 */
1861 generate_composite_function \
1862     pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
1863     FLAG_DST_READWRITE, \
1864     8, /* number of pixels, processed in a single block */ \
1865     5, /* prefetch distance */ \
1866     default_init_need_all_regs, \
1867     default_cleanup_need_all_regs, \
1868     pixman_composite_over_0565_8_0565_process_pixblock_head, \
1869     pixman_composite_over_0565_8_0565_process_pixblock_tail, \
1870     pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
1871     28, /* dst_w_basereg */ \
1872     10, /* dst_r_basereg */ \
1873     8, /* src_basereg */ \
1874     15 /* mask_basereg */
1876 /******************************************************************************/
/*
 * ADD operator, r5g6b5 source, a8 mask, r5g6b5 destination: unpack
 * both to planar x888, src *= mask with rounding, then add to dst and
 * repack to 565.
 * NOTE(review): the saturating-add instructions are elided between the
 * visible head and tail of this excerpt -- confirm against full
 * source.
 */
1878 .macro pixman_composite_add_0565_8_0565_process_pixblock_head
1879     /* mask is in d15 */
1880     convert_0565_to_x888 q4, d2, d1, d0
1881     convert_0565_to_x888 q5, d6, d5, d4
1882     /* source pixel data is in {d0, d1, d2, XX} */
1883     /* destination pixel data is in {d4, d5, d6, XX} */
1884     vmull.u8 q6, d15, d2
1885     vmull.u8 q5, d15, d1
1886     vmull.u8 q4, d15, d0
/* rounded narrowing of src*mask (x/255 idiom) */
1887     vrshr.u16 q12, q6, #8
1888     vrshr.u16 q11, q5, #8
1889     vrshr.u16 q10, q4, #8
1890     vraddhn.u16 d2, q6, q12
1891     vraddhn.u16 d1, q5, q11
1892     vraddhn.u16 d0, q4, q10
1895 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail
1898     /* 32bpp result is in {d0, d1, d2, XX} */
1899     convert_8888_to_0565 d2, d1, d0, q14, q15, q3
1902 /* TODO: expand macros and do better instructions scheduling */
1903 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
1904     vld1.8 {d15}, [MASK]!
1905     pixman_composite_add_0565_8_0565_process_pixblock_tail
1906     vld1.16 {d8, d9}, [SRC]!
1907     vld1.16 {d10, d11}, [DST_R, :128]!
1909     pixman_composite_add_0565_8_0565_process_pixblock_head
1910     vst1.16 {d28, d29}, [DST_W, :128]!
/* 16bpp src, 8bpp mask, 16bpp dst */
1913 generate_composite_function \
1914     pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
1915     FLAG_DST_READWRITE, \
1916     8, /* number of pixels, processed in a single block */ \
1917     5, /* prefetch distance */ \
1918     default_init_need_all_regs, \
1919     default_cleanup_need_all_regs, \
1920     pixman_composite_add_0565_8_0565_process_pixblock_head, \
1921     pixman_composite_add_0565_8_0565_process_pixblock_tail, \
1922     pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
1923     28, /* dst_w_basereg */ \
1924     10, /* dst_r_basereg */ \
1925     8, /* src_basereg */ \
1926     15 /* mask_basereg */
1928 /******************************************************************************/
/*
 * OUT_REVERSE operator, a8 source, r5g6b5 destination:
 *   dst = dst * (1 - src)
 * head: unpack dst from 565, invert the 8-bit source value (d15) and
 * multiply each destination channel by it; tail: round/narrow the
 * products and repack to 565.
 * NOTE(review): the "mask is in d15" comment below is historical --
 * d15 here holds the 8-bit *source*, loaded via src_basereg = 15 in
 * the generator arguments and [SRC] in the tail_head below.
 */
1930 .macro pixman_composite_out_reverse_8_0565_process_pixblock_head
1931     /* mask is in d15 */
1932     convert_0565_to_x888 q5, d6, d5, d4
1933     /* destination pixel data is in {d4, d5, d6, xx} */
1934     vmvn.8 d24, d15 /* get inverted alpha */
1935     /* now do alpha blending */
1936     vmull.u8 q8, d24, d4
1937     vmull.u8 q9, d24, d5
1938     vmull.u8 q10, d24, d6
1941 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
1942     vrshr.u16 q14, q8, #8
1943     vrshr.u16 q15, q9, #8
1944     vrshr.u16 q12, q10, #8
1945     vraddhn.u16 d0, q14, q8
1946     vraddhn.u16 d1, q15, q9
1947     vraddhn.u16 d2, q12, q10
1948     /* 32bpp result is in {d0, d1, d2, XX} */
1949     convert_8888_to_0565 d2, d1, d0, q14, q15, q3
1952 /* TODO: expand macros and do better instructions scheduling */
1953 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
1954     vld1.8 {d15}, [SRC]!
1955     pixman_composite_out_reverse_8_0565_process_pixblock_tail
1956     vld1.16 {d10, d11}, [DST_R, :128]!
1958     pixman_composite_out_reverse_8_0565_process_pixblock_head
1959     vst1.16 {d28, d29}, [DST_W, :128]!
/* 8bpp src, no mask, 16bpp dst */
1962 generate_composite_function \
1963     pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
1964     FLAG_DST_READWRITE, \
1965     8, /* number of pixels, processed in a single block */ \
1966     5, /* prefetch distance */ \
1967     default_init_need_all_regs, \
1968     default_cleanup_need_all_regs, \
1969     pixman_composite_out_reverse_8_0565_process_pixblock_head, \
1970     pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
1971     pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
1972     28, /* dst_w_basereg */ \
1973     10, /* dst_r_basereg */ \
1974     15, /* src_basereg */ \
1975     0 /* mask_basereg */