2 * Copyright © 2009 Nokia Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
23 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
27 * This file contains implementations of NEON optimized pixel processing
28 * functions. There is no full and detailed tutorial, but some functions
29 * (those which are exposing some new or interesting features) are
30 * extensively commented and can be used as examples.
32 * You may want to have a look at the comments for following functions:
33 * - pixman_composite_over_8888_0565_asm_neon
34 * - pixman_composite_over_n_8_0565_asm_neon
37 /* Prevent the stack from becoming executable for no reason... */
38 #if defined(__linux__) && defined(__ELF__)
39 .section .note.GNU-stack,"",%progbits
46 .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
47 .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
51 #include "pixman-arm-neon-asm.h"
53 /* Global configuration options and preferences */
56 * The code can optionally make use of unaligned memory accesses to improve
57 * performance of handling leading/trailing pixels for each scanline.
58 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
59 * example in linux if unaligned memory accesses are not configured to
60 * generate exceptions.
62 .set RESPECT_STRICT_ALIGNMENT, 1
65 * Set default prefetch type. There is a choice between the following options:
67 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
68 * as NOP to workaround some HW bugs or for whatever other reason)
70 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
71 * advanced prefetch introduces heavy overhead)
73 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
74 * which can run ARM and NEON instructions simultaneously so that extra ARM
75 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
77 * Note: some types of functions can't support advanced prefetch and fall back
78 * to the simple one (those which handle 24bpp pixels)
80 .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
82 /* Prefetch distance in pixels for simple prefetch */
83 .set PREFETCH_DISTANCE_SIMPLE, 64
86 * Implementation of pixman_composite_over_8888_0565_asm_neon
88 * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
89 * performs OVER compositing operation. Function fast_composite_over_8888_0565
90 * from pixman-fast-path.c does the same in C and can be used as a reference.
92 * First we need to have some NEON assembly code which can do the actual
93 * operation on the pixels and provide it to the template macro.
95 * Template macro quite conveniently takes care of emitting all the necessary
96 * code for memory reading and writing (including quite tricky cases of
97 * handling unaligned leading/trailing pixels), so we only need to deal with
98 * the data in NEON registers.
100 * NEON registers allocation in general is recommended to be the following:
101 * d0, d1, d2, d3 - contain loaded source pixel data
102 * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed)
103 * d24, d25, d26, d27 - contain loaded mask pixel data (if mask is used)
104 * d28, d29, d30, d31 - place for storing the result (destination pixels)
106 * As can be seen above, four 64-bit NEON registers are used for keeping
107 * intermediate pixel data and up to 8 pixels can be processed in one step
108 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
110 * This particular function uses the following registers allocation:
111 * d0, d1, d2, d3 - contain loaded source pixel data
112 * d4, d5 - contain loaded destination pixels (they are needed)
113 * d28, d29 - place for storing the result (destination pixels)
117 * Step one. We need to have some code to do some arithmetic on pixel data.
118 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
119 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
120 * perform all the needed calculations and write the result to {d28, d29}.
121 * The rationale for having two macros and not just one will be explained
122 * later. In practice, any single monolithic function which does the work can
123 * be split into two parts in any arbitrary way without affecting correctness.
125 * There is one special trick here too. Common template macro can optionally
126 * make our life a bit easier by doing R, G, B, A color components
127 * deinterleaving for 32bpp pixel formats (and this feature is used in
128 * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
129 * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
130 * actually use d0 register for blue channel (a vector of eight 8-bit
131 * values), d1 register for green, d2 for red and d3 for alpha. This
132 * simple conversion can be also done with a few NEON instructions:
134 * Packed to planar conversion:
140 * Planar to packed conversion:
146 * But pixel can be loaded directly in planar format using VLD4.8 NEON
147 * instruction. It is 1 cycle slower than VLD1.32, so this is not always
148 * desirable, that's why deinterleaving is optional.
150 * But anyway, here is the code:
/*
 * Head half of the OVER (a8r8g8b8 source, r5g6b5 destination) pixel block
 * computation. The visible instructions invert the source alpha plane in
 * d3, narrow the destination pixels held in q2 and scale destination
 * components by the inverted alpha, leaving 8-bit results in d20, d23, d22.
 * NOTE(review): the comments mention d6/d7 and the code reads q10/q11, but
 * the instructions producing them are not present in this listing - confirm
 * against the complete file.
 */
152 .macro pixman_composite_over_8888_0565_process_pixblock_head
153 /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
154 and put data into d6 - red, d7 - green, d30 - blue */
159 vmvn.8 d3, d3 /* invert source alpha */
/* each 16-bit lane of q2 is shifted right by 2 and truncated to 8 bits */
161 vshrn.u16 d30, q2, #2
162 /* now do alpha blending, storing results in 8-bit planar format
163 into d16 - red, d19 - green, d18 - blue */
/* widening 8x8 -> 16 bit multiply: q12 = inverted alpha * d30 */
166 vmull.u8 q12, d3, d30
/*
 * For each 16-bit product x: t = (x + 128) >> 8, then
 * result = (x + t + 128) >> 8, i.e. a rounded division by 255.
 */
167 vrshr.u16 q13, q10, #8
168 vrshr.u16 q3, q11, #8
169 vrshr.u16 q15, q12, #8
170 vraddhn.u16 d20, q10, q13
171 vraddhn.u16 d23, q11, q3
172 vraddhn.u16 d22, q12, q15
/*
 * Tail half of the OVER 8888 -> 0565 computation: adds the source to the
 * alpha-scaled destination produced by the head macro and repacks the
 * result as r5g6b5 in q14 (d28, d29).
 * NOTE(review): only the d16 path and the final insert are visible here;
 * the instructions that fill q9 are not present in this listing.
 */
175 .macro pixman_composite_over_8888_0565_process_pixblock_tail
176 /* ... continue alpha blending */
/* saturating add: d2 (source red plane) + d20 (scaled destination) */
177 vqadd.u8 d16, d2, d20
179 /* convert the result to r5g6b5 and store it into {d28, d29} */
/* widen d16 so that each byte ends up in the high byte of a 16-bit lane */
180 vshll.u8 q14, d16, #8
/* insert (q9 >> 11) into the low 5 bits of each lane, keeping the top 11 */
184 vsri.u16 q14, q9, #11
188 * OK, now we got almost everything that we need. Using the above two
189 * macros, the work can be done right. But now we want to optimize
190 * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
191 * a lot from good code scheduling and software pipelining.
193 * Let's construct some code, which will run in the core main loop.
194 * Some pseudo-code of the main loop will look like this:
202 * It may look a bit weird, but this setup allows to hide instruction
203 * latencies better and also utilize dual-issue capability more
204 * efficiently (make pairs of load-store and ALU instructions).
206 * So what we need now is a '*_tail_head' macro, which will be used
207 * in the core main loop. A trivial straightforward implementation
208 * of this macro would look like this:
210 * pixman_composite_over_8888_0565_process_pixblock_tail
211 * vst1.16 {d28, d29}, [DST_W, :128]!
212 * vld1.16 {d4, d5}, [DST_R, :128]!
213 * vld4.32 {d0, d1, d2, d3}, [SRC]!
214 * pixman_composite_over_8888_0565_process_pixblock_head
217 * Now it also got some VLD/VST instructions. We simply can't move from
218 * processing one block of pixels to the other one with just arithmetic.
219 * The previously processed data needs to be written to memory and new
220 * data needs to be fetched. Fortunately, this main loop does not deal
221 * with partial leading/trailing pixels and can load/store a full block
222 * of pixels in a bulk. Additionally, destination buffer is already
223 * 16 bytes aligned here (which is good for performance).
225 * New things here are DST_R, DST_W, SRC and MASK identifiers. These
226 * are the aliases for ARM registers which are used as pointers for
227 * accessing data. We maintain separate pointers for reading and writing
228 * destination buffer (DST_R and DST_W).
230 * Another new thing is 'cache_preload' macro. It is used for prefetching
231 * data into CPU L2 cache and improves performance when dealing with large
232 * images which are far larger than cache size. It uses one argument
233 * (actually two, but they need to be the same here) - number of pixels
234 * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
235 * details about this macro. Moreover, if good performance is needed
236 * the code from this macro needs to be copied into '*_tail_head' macro
237 * and mixed with the rest of code for optimal instructions scheduling.
238 * We are actually doing it below.
240 * Now after all the explanations, here is the optimized code.
241 * Different instruction streams (originating from '*_head', '*_tail'
242 * and 'cache_preload' macro) use different indentation levels for
243 * better readability. Actually taking the code from one of these
244 * indentation levels and ignoring a few VLD/VST instructions would
245 * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
/*
 * Software-pipelined main loop body for OVER 8888 -> 0565. It interleaves
 * three instruction streams:
 *   - the tail of the previous block (vqadd / vshll / vsri),
 *   - the head of the next block (vshrn / vmull / vrshr / vraddhn),
 *   - the PF-prefixed prefetch stream,
 * together with the destination load and store.
 * NOTE(review): PF is defined outside this section (presumably in
 * pixman-arm-neon-asm.h). The ne/ge condition suffixes depend on
 * flag-setting instructions that are not present in this listing, and no
 * source fetch is visible either - confirm against the complete file.
 */
251 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head
252 vqadd.u8 d16, d2, d20
/* load the next 8 destination pixels (16 bytes, 128-bit aligned, post-increment) */
253 vld1.16 {d4, d5}, [DST_R, :128]!
259 vshll.u8 q14, d16, #8
/* prefetch stream: advance the prefetch X position by one block (8 pixels) */
260 PF add PF_X, PF_X, #8
264 PF addne PF_X, PF_X, #8
266 PF subne PF_CTL, PF_CTL, #1
268 vshrn.u16 d30, q2, #2
/* prefetch stream: preload source and destination at the prefetch position */
270 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
272 vmull.u8 q12, d3, d30
273 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
277 vrshr.u16 q13, q10, #8
/* prefetch stream: wrap the X position back by ORIG_W when the condition holds */
278 PF subge PF_X, PF_X, ORIG_W
279 vrshr.u16 q3, q11, #8
280 vrshr.u16 q15, q12, #8
281 PF subges PF_CTL, PF_CTL, #0x10
/* last step of the tail stream: q14 now holds the packed result of the previous block */
282 vsri.u16 q14, q9, #11
/* prefetch stream: byte load with write-back steps PF_SRC / PF_DST by one stride */
283 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
284 vraddhn.u16 d20, q10, q13
285 vraddhn.u16 d23, q11, q3
286 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
287 vraddhn.u16 d22, q12, q15
/* store the previous block's 8 result pixels (128-bit aligned, post-increment) */
288 vst1.16 {d28, d29}, [DST_W, :128]!
293 /* If we did not care much about the performance, we would just use this... */
/*
 * Straightforward (unscheduled) form of the tail_head macro: finish the
 * previous block, store it, load the next destination block, then start
 * the next block.
 * NOTE(review): this reuses the name of the scheduled definition earlier in
 * the file; presumably only one of the two is assembled, selected by a
 * preprocessor conditional that is not visible in this listing - confirm.
 */
294 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head
295 pixman_composite_over_8888_0565_process_pixblock_tail
/* write out the 8 finished r5g6b5 pixels */
296 vst1.16 {d28, d29}, [DST_W, :128]!
/* read the next 8 destination pixels */
297 vld1.16 {d4, d5}, [DST_R, :128]!
299 pixman_composite_over_8888_0565_process_pixblock_head
306 * And now the final part. We are using 'generate_composite_function' macro
307 * to put all the stuff together. We are specifying the name of the function
308 * which we want to get, number of bits per pixel for the source, mask and
309 * destination (0 if unused, like mask in this case). Next come some bit
311 * FLAG_DST_READWRITE - tells that the destination buffer is both read
312 * and written, for write-only buffer we would use
313 * FLAG_DST_WRITEONLY flag instead
314 * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
315 * and separate color channels for 32bpp format.
316 * The next things are:
317 * - the number of pixels processed per iteration (8 in this case, because
318 * that's the maximum that can fit into four 64-bit NEON registers).
319 * - prefetch distance, measured in pixel blocks. In this case it is 5 times
320 * by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
321 * prefetch distance can be selected by running some benchmarks.
323 * After that we specify some macros, these are 'default_init',
324 * 'default_cleanup' here which are empty (but it is possible to have custom
325 * init/cleanup macros to be able to save/restore some extra NEON registers
326 * like d8-d15 or do anything else) followed by
327 * 'pixman_composite_over_8888_0565_process_pixblock_head',
328 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
329 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
330 * which we got implemented above.
332 * The last part is the NEON registers allocation scheme.
334 generate_composite_function \
335 pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
336 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
337 8, /* number of pixels, processed in a single block */ \
338 5, /* prefetch distance */ \
341 pixman_composite_over_8888_0565_process_pixblock_head, \
342 pixman_composite_over_8888_0565_process_pixblock_tail, \
343 pixman_composite_over_8888_0565_process_pixblock_tail_head, \
344 28, /* dst_w_basereg */ \
345 4, /* dst_r_basereg */ \
346 0, /* src_basereg */ \
347 24 /* mask_basereg */
349 /******************************************************************************/
/*
 * Head half of OVER with a solid source onto an r5g6b5 destination.
 * Unlike the 8888 -> 0565 head macro, no alpha inversion is visible here:
 * d3 is inverted once in pixman_composite_over_n_0565_init.
 * NOTE(review): q10/q11 are read below but the instructions producing them
 * are not present in this listing - confirm against the complete file.
 */
351 .macro pixman_composite_over_n_0565_process_pixblock_head
352 /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
353 and put data into d6 - red, d7 - green, d30 - blue */
/* each 16-bit lane of q2 is shifted right by 2 and truncated to 8 bits */
359 vshrn.u16 d30, q2, #2
360 /* now do alpha blending, storing results in 8-bit planar format
361 into d16 - red, d19 - green, d18 - blue */
/* widening multiply of d3 (inverted alpha, set up by init) with d30 */
364 vmull.u8 q12, d3, d30
/* vrshr #8 followed by vraddhn gives a rounded division of each product by 255 */
365 vrshr.u16 q13, q10, #8
366 vrshr.u16 q3, q11, #8
367 vrshr.u16 q15, q12, #8
368 vraddhn.u16 d20, q10, q13
369 vraddhn.u16 d23, q11, q3
370 vraddhn.u16 d22, q12, q15
/*
 * Tail half of OVER solid -> r5g6b5: adds the solid source component to the
 * alpha-scaled destination and repacks the result into q14 (d28, d29).
 * NOTE(review): the instructions that fill q9 are not present in this listing.
 */
373 .macro pixman_composite_over_n_0565_process_pixblock_tail
374 /* ... continue alpha blending */
/* saturating add of d2 and the scaled destination in d20 */
375 vqadd.u8 d16, d2, d20
377 /* convert the result to r5g6b5 and store it into {d28, d29} */
/* each byte of d16 is placed in the high byte of a 16-bit lane of q14 */
378 vshll.u8 q14, d16, #8
/* (q9 >> 11) is inserted into the low 5 bits of each lane of q14 */
382 vsri.u16 q14, q9, #11
385 /* TODO: expand macros and do better instructions scheduling */
/*
 * Unscheduled main loop body: run the tail for the previous block, load the
 * next destination block, store the finished block, then run the head for
 * the next block. No source fetch is involved since the source is solid
 * (the function is generated with a source bpp of 0).
 */
386 .macro pixman_composite_over_n_0565_process_pixblock_tail_head
387 pixman_composite_over_n_0565_process_pixblock_tail
/* next 8 destination pixels are read through the separate read pointer */
388 vld1.16 {d4, d5}, [DST_R, :128]!
/* finished 8 pixels are written through the write pointer */
389 vst1.16 {d28, d29}, [DST_W, :128]!
390 pixman_composite_over_n_0565_process_pixblock_head
/*
 * One-time setup for the solid-source OVER -> r5g6b5 function: fetches a
 * 32-bit value from the stack at ARGS_STACK_OFFSET into d3 and leaves d3
 * inverted for use by the head macro.
 * NOTE(review): presumably the fetched word is the solid source pixel and
 * the instructions splitting it into per-channel registers are missing from
 * this listing - confirm against the complete file.
 */
394 .macro pixman_composite_over_n_0565_init
/* DUMMY = address of the stack-passed argument */
395 add DUMMY, sp, #ARGS_STACK_OFFSET
/* load one 32-bit word into lane 0 of d3 */
396 vld1.32 {d3[0]}, [DUMMY]
401 vmvn.8 d3, d3 /* invert source alpha */
404 generate_composite_function \
405 pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
406 FLAG_DST_READWRITE, \
407 8, /* number of pixels, processed in a single block */ \
408 5, /* prefetch distance */ \
409 pixman_composite_over_n_0565_init, \
411 pixman_composite_over_n_0565_process_pixblock_head, \
412 pixman_composite_over_n_0565_process_pixblock_tail, \
413 pixman_composite_over_n_0565_process_pixblock_tail_head, \
414 28, /* dst_w_basereg */ \
415 4, /* dst_r_basereg */ \
416 0, /* src_basereg */ \
417 24 /* mask_basereg */
419 /******************************************************************************/
421 .macro pixman_composite_src_8888_0565_process_pixblock_head
427 .macro pixman_composite_src_8888_0565_process_pixblock_tail
429 vsri.u16 q14, q9, #11
/*
 * Main loop body for SRC a8r8g8b8 -> r5g6b5 with the prefetch stream mixed
 * in. Only the source is prefetched; the destination is write-only for this
 * function.
 * NOTE(review): most of the conversion instructions and the flag-setting
 * instructions for the ne/ge conditions are not present in this listing.
 */
432 .macro pixman_composite_src_8888_0565_process_pixblock_tail_head
/* prefetch stream: advance the prefetch X position by one block (8 pixels) */
434 PF add PF_X, PF_X, #8
437 PF addne PF_X, PF_X, #8
438 PF subne PF_CTL, PF_CTL, #1
/* final packing step: (q9 >> 11) goes into the low 5 bits of each q14 lane */
439 vsri.u16 q14, q9, #11
441 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
/* store 8 converted pixels (128-bit aligned, post-increment) */
443 vst1.16 {d28, d29}, [DST_W, :128]!
444 PF subge PF_X, PF_X, ORIG_W
445 PF subges PF_CTL, PF_CTL, #0x10
/* byte load with write-back steps PF_SRC by one (scaled) stride */
447 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
451 generate_composite_function \
452 pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
453 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
454 8, /* number of pixels, processed in a single block */ \
455 10, /* prefetch distance */ \
458 pixman_composite_src_8888_0565_process_pixblock_head, \
459 pixman_composite_src_8888_0565_process_pixblock_tail, \
460 pixman_composite_src_8888_0565_process_pixblock_tail_head
462 /******************************************************************************/
/*
 * Head macro for SRC r5g6b5 -> 8888: extracts byte-wide fields from the
 * eight 16-bit source pixels in q0 into the planar output registers
 * d30, d29, d28.
 * NOTE(review): instructions between the visible ones (original lines
 * 467-470) are missing from this listing; they may modify q0 before the
 * last narrowing shift - confirm against the complete file.
 */
464 .macro pixman_composite_src_0565_8888_process_pixblock_head
/* d30 = bits 15..8 of each pixel */
465 vshrn.u16 d30, q0, #8
/* d29 = bits 10..3 of each pixel */
466 vshrn.u16 d29, q0, #3
/* d28 = bits 9..2 of each lane of q0 as it stands at this point */
471 vshrn.u16 d28, q0, #2
474 .macro pixman_composite_src_0565_8888_process_pixblock_tail
477 /* TODO: expand macros and do better instructions scheduling */
/*
 * Unscheduled main loop body for SRC r5g6b5 -> 8888: tail of the previous
 * block, store, then head of the next block.
 * NOTE(review): no source fetch is visible between the store and the head;
 * presumably it is on a line missing from this listing.
 */
478 .macro pixman_composite_src_0565_8888_process_pixblock_tail_head
479 pixman_composite_src_0565_8888_process_pixblock_tail
/* interleave the four 8-byte planes d28..d31 into 8 packed 32-bit pixels */
480 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
482 pixman_composite_src_0565_8888_process_pixblock_head
486 generate_composite_function \
487 pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
488 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
489 8, /* number of pixels, processed in a single block */ \
490 10, /* prefetch distance */ \
493 pixman_composite_src_0565_8888_process_pixblock_head, \
494 pixman_composite_src_0565_8888_process_pixblock_tail, \
495 pixman_composite_src_0565_8888_process_pixblock_tail_head
497 /******************************************************************************/
499 .macro pixman_composite_add_8_8_process_pixblock_head
504 .macro pixman_composite_add_8_8_process_pixblock_tail
/*
 * Main loop body for ADD on 8bpp data, 32 pixels per block, with the
 * prefetch stream for both source and destination mixed in.
 * NOTE(review): the arithmetic instructions, the source fetch and the
 * flag-setting instructions for the ne/ge conditions are not present in
 * this listing - confirm against the complete file.
 */
507 .macro pixman_composite_add_8_8_process_pixblock_tail_head
/* prefetch stream: advance the prefetch X position by one block (32 pixels) */
509 PF add PF_X, PF_X, #32
/* load the next 32 destination bytes (128-bit aligned, post-increment) */
511 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
512 PF addne PF_X, PF_X, #32
513 PF subne PF_CTL, PF_CTL, #1
/* store the 32 result bytes of the previous block */
514 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
516 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
517 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
518 PF subge PF_X, PF_X, ORIG_W
519 PF subges PF_CTL, PF_CTL, #0x10
/* byte loads with write-back step the prefetch pointers by one stride */
521 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
522 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
526 generate_composite_function \
527 pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
528 FLAG_DST_READWRITE, \
529 32, /* number of pixels, processed in a single block */ \
530 10, /* prefetch distance */ \
533 pixman_composite_add_8_8_process_pixblock_head, \
534 pixman_composite_add_8_8_process_pixblock_tail, \
535 pixman_composite_add_8_8_process_pixblock_tail_head
537 /******************************************************************************/
/*
 * Main loop body for ADD on 32bpp data, 8 pixels per block. It is paired
 * with the add_8_8 head and tail macros in the generate_* invocations that
 * follow; only the block size and element size differ from the 8bpp body.
 * NOTE(review): the arithmetic instructions, the source fetch and the
 * flag-setting instructions for the ne/ge conditions are not present in
 * this listing - confirm against the complete file.
 */
539 .macro pixman_composite_add_8888_8888_process_pixblock_tail_head
/* prefetch stream: advance the prefetch X position by one block (8 pixels) */
541 PF add PF_X, PF_X, #8
/* load the next 8 destination pixels (32 bytes, 128-bit aligned) */
543 vld1.32 {d4, d5, d6, d7}, [DST_R, :128]!
544 PF addne PF_X, PF_X, #8
545 PF subne PF_CTL, PF_CTL, #1
/* store the 8 result pixels of the previous block */
546 vst1.32 {d28, d29, d30, d31}, [DST_W, :128]!
548 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
549 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
550 PF subge PF_X, PF_X, ORIG_W
551 PF subges PF_CTL, PF_CTL, #0x10
/* byte loads with write-back step the prefetch pointers by one stride */
553 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
554 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
558 generate_composite_function \
559 pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
560 FLAG_DST_READWRITE, \
561 8, /* number of pixels, processed in a single block */ \
562 10, /* prefetch distance */ \
565 pixman_composite_add_8_8_process_pixblock_head, \
566 pixman_composite_add_8_8_process_pixblock_tail, \
567 pixman_composite_add_8888_8888_process_pixblock_tail_head
569 generate_composite_function_single_scanline \
570 pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
571 FLAG_DST_READWRITE, \
572 8, /* number of pixels, processed in a single block */ \
575 pixman_composite_add_8_8_process_pixblock_head, \
576 pixman_composite_add_8_8_process_pixblock_tail, \
577 pixman_composite_add_8888_8888_process_pixblock_tail_head
579 /******************************************************************************/
/*
 * Head half of OUT_REVERSE 8888 -> 8888 on planar data: multiplies
 * destination planes by the inverted source alpha, producing 16-bit
 * products that the tail macro scales back to 8 bits.
 * NOTE(review): only the products for d6 and d7 (q10, q11) are visible; the
 * tail also reads q8 and q9, whose producers are missing from this listing.
 */
581 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
582 vmvn.8 d24, d3 /* get inverted alpha */
583 /* do alpha blending */
/* widening 8x8 -> 16 bit multiplies: destination plane * inverted alpha */
586 vmull.u8 q10, d24, d6
587 vmull.u8 q11, d24, d7
/*
 * Tail half of OUT_REVERSE 8888 -> 8888: converts the four 16-bit product
 * vectors q8..q11 into the 8-bit result planes d28..d31.
 * For each product x: t = (x + 128) >> 8 (vrshr), then
 * result = (x + t + 128) >> 8 (vraddhn), i.e. a rounded division by 255.
 */
590 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
/* step 1: t = rounded (x >> 8) for each product vector */
591 vrshr.u16 q14, q8, #8
592 vrshr.u16 q15, q9, #8
593 vrshr.u16 q12, q10, #8
594 vrshr.u16 q13, q11, #8
/* step 2: rounded high half of (t + x), narrowed to 8 bits per lane */
595 vraddhn.u16 d28, q14, q8
596 vraddhn.u16 d29, q15, q9
597 vraddhn.u16 d30, q12, q10
598 vraddhn.u16 d31, q13, q11
/*
 * Software-pipelined main loop body for OUT_REVERSE 8888 -> 8888. It
 * interleaves the tail of the previous block (vrshr / vraddhn), the head of
 * the next block (vmull) and the PF-prefixed prefetch stream, plus the
 * destination load and store.
 * NOTE(review): the multiplies below use d22 as the inverted alpha, whereas
 * the standalone head macro uses d24. The instruction that fills d22, the
 * source fetch, the q8/q9 multiplies and the flag-setting instructions for
 * the ne/ge conditions are not present in this listing - confirm against
 * the complete file.
 */
601 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
/* load the next 8 destination pixels, de-interleaved into four planes */
602 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
603 vrshr.u16 q14, q8, #8
/* prefetch stream: advance the prefetch X position by one block (8 pixels) */
604 PF add PF_X, PF_X, #8
606 vrshr.u16 q15, q9, #8
607 vrshr.u16 q12, q10, #8
608 vrshr.u16 q13, q11, #8
609 PF addne PF_X, PF_X, #8
610 PF subne PF_CTL, PF_CTL, #1
611 vraddhn.u16 d28, q14, q8
612 vraddhn.u16 d29, q15, q9
614 vraddhn.u16 d30, q12, q10
615 vraddhn.u16 d31, q13, q11
617 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
619 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
/* store the previous block: four planes re-interleaved into 8 pixels */
620 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
621 PF subge PF_X, PF_X, ORIG_W
623 PF subges PF_CTL, PF_CTL, #0x10
625 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
/* head of the next block: new destination planes * inverted alpha */
626 vmull.u8 q10, d22, d6
627 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
628 vmull.u8 q11, d22, d7
631 generate_composite_function_single_scanline \
632 pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
633 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
634 8, /* number of pixels, processed in a single block */ \
637 pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
638 pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
639 pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
641 /******************************************************************************/
643 .macro pixman_composite_over_8888_8888_process_pixblock_head
644 pixman_composite_out_reverse_8888_8888_process_pixblock_head
/*
 * Tail half of OVER 8888 -> 8888: reuses the OUT_REVERSE tail to obtain the
 * destination scaled by the inverted source alpha in d28..d31, then adds
 * the source planes (d0..d3, as q0/q1) with unsigned saturation.
 */
647 .macro pixman_composite_over_8888_8888_process_pixblock_tail
648 pixman_composite_out_reverse_8888_8888_process_pixblock_tail
/* result = saturate(source + scaled destination), 16 bytes per instruction */
649 vqadd.u8 q14, q0, q14
650 vqadd.u8 q15, q1, q15
/*
 * Software-pipelined main loop body for OVER 8888 -> 8888. The instruction
 * mix matches the OUT_REVERSE tail_head macro with the two saturating adds
 * of the source (q0, q1) inserted before the store.
 * NOTE(review): the multiplies below use d22 as the inverted alpha. The
 * instruction that fills d22, the source fetch, the q8/q9 multiplies and
 * the flag-setting instructions for the ne/ge conditions are not present
 * in this listing - confirm against the complete file.
 */
653 .macro pixman_composite_over_8888_8888_process_pixblock_tail_head
/* load the next 8 destination pixels, de-interleaved into four planes */
654 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
655 vrshr.u16 q14, q8, #8
/* prefetch stream: advance the prefetch X position by one block (8 pixels) */
656 PF add PF_X, PF_X, #8
658 vrshr.u16 q15, q9, #8
659 vrshr.u16 q12, q10, #8
660 vrshr.u16 q13, q11, #8
661 PF addne PF_X, PF_X, #8
662 PF subne PF_CTL, PF_CTL, #1
663 vraddhn.u16 d28, q14, q8
664 vraddhn.u16 d29, q15, q9
666 vraddhn.u16 d30, q12, q10
667 vraddhn.u16 d31, q13, q11
/* add the source of the previous block with unsigned saturation */
668 vqadd.u8 q14, q0, q14
669 vqadd.u8 q15, q1, q15
671 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
673 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
/* store the previous block: four planes re-interleaved into 8 pixels */
674 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
675 PF subge PF_X, PF_X, ORIG_W
677 PF subges PF_CTL, PF_CTL, #0x10
679 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
/* head of the next block: new destination planes * inverted alpha */
680 vmull.u8 q10, d22, d6
681 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
682 vmull.u8 q11, d22, d7
685 generate_composite_function \
686 pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
687 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
688 8, /* number of pixels, processed in a single block */ \
689 5, /* prefetch distance */ \
692 pixman_composite_over_8888_8888_process_pixblock_head, \
693 pixman_composite_over_8888_8888_process_pixblock_tail, \
694 pixman_composite_over_8888_8888_process_pixblock_tail_head
696 generate_composite_function_single_scanline \
697 pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
698 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
699 8, /* number of pixels, processed in a single block */ \
702 pixman_composite_over_8888_8888_process_pixblock_head, \
703 pixman_composite_over_8888_8888_process_pixblock_tail, \
704 pixman_composite_over_8888_8888_process_pixblock_tail_head
706 /******************************************************************************/
708 /* TODO: expand macros and do better instructions scheduling */
/*
 * Unscheduled main loop body for OVER with a solid source onto 8888: runs
 * the shared 8888 tail, loads the next destination block, stores the
 * finished block and runs the shared 8888 head. There is no source fetch
 * because the source is solid (generated with a source bpp of 0).
 */
709 .macro pixman_composite_over_n_8888_process_pixblock_tail_head
710 pixman_composite_over_8888_8888_process_pixblock_tail
/* next 8 destination pixels, de-interleaved into planes d4..d7 */
711 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
/* finished 8 pixels, re-interleaved from planes d28..d31 */
712 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
713 pixman_composite_over_8888_8888_process_pixblock_head
/*
 * One-time setup for solid-source OVER -> 8888: fetches a 32-bit value from
 * the stack at ARGS_STACK_OFFSET into lane 0 of d3.
 * NOTE(review): presumably this is the solid source pixel and the
 * instructions that replicate its channels into d0..d3 are missing from
 * this listing - confirm against the complete file.
 */
717 .macro pixman_composite_over_n_8888_init
/* DUMMY = address of the stack-passed argument */
718 add DUMMY, sp, #ARGS_STACK_OFFSET
719 vld1.32 {d3[0]}, [DUMMY]
726 generate_composite_function \
727 pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
728 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
729 8, /* number of pixels, processed in a single block */ \
730 5, /* prefetch distance */ \
731 pixman_composite_over_n_8888_init, \
733 pixman_composite_over_8888_8888_process_pixblock_head, \
734 pixman_composite_over_8888_8888_process_pixblock_tail, \
735 pixman_composite_over_n_8888_process_pixblock_tail_head
737 /******************************************************************************/
/*
 * Software-pipelined main loop body for OVER_REVERSE with a solid source.
 * The register roles are swapped relative to plain OVER: the destination
 * block is loaded into d0..d3 and the solid colour lives in d4..d7 (see the
 * dst_r_basereg / src_basereg arguments of the generate_composite_function
 * invocation and the init macro loading d7). Only the destination is
 * prefetched.
 * NOTE(review): the instruction that fills d22, the q8/q9 multiplies and
 * the flag-setting instructions for the ge/ne conditions are not present
 * in this listing - confirm against the complete file.
 */
739 .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
740 vrshr.u16 q14, q8, #8
/* prefetch stream: advance the prefetch X position by one block (8 pixels) */
741 PF add PF_X, PF_X, #8
743 vrshr.u16 q15, q9, #8
744 vrshr.u16 q12, q10, #8
745 vrshr.u16 q13, q11, #8
746 PF addne PF_X, PF_X, #8
747 PF subne PF_CTL, PF_CTL, #1
748 vraddhn.u16 d28, q14, q8
749 vraddhn.u16 d29, q15, q9
751 vraddhn.u16 d30, q12, q10
752 vraddhn.u16 d31, q13, q11
/* add the previous destination block (still in q0/q1) with saturation */
753 vqadd.u8 q14, q0, q14
754 vqadd.u8 q15, q1, q15
/* only now overwrite d0..d3 with the next destination block */
755 vld4.8 {d0, d1, d2, d3}, [DST_R, :128]!
757 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
758 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
759 PF subge PF_X, PF_X, ORIG_W
761 PF subges PF_CTL, PF_CTL, #0x10
/* head of the next block: solid colour planes d6/d7 scaled by d22 */
763 vmull.u8 q10, d22, d6
764 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
765 vmull.u8 q11, d22, d7
/*
 * One-time setup for solid-source OVER_REVERSE -> 8888: fetches a 32-bit
 * value from the stack at ARGS_STACK_OFFSET into lane 0 of d7.
 * NOTE(review): presumably this is the solid source pixel and the
 * instructions that replicate its channels into d4..d7 are missing from
 * this listing - confirm against the complete file.
 */
768 .macro pixman_composite_over_reverse_n_8888_init
/* DUMMY = address of the stack-passed argument */
769 add DUMMY, sp, #ARGS_STACK_OFFSET
770 vld1.32 {d7[0]}, [DUMMY]
777 generate_composite_function \
778 pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
779 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
780 8, /* number of pixels, processed in a single block */ \
781 5, /* prefetch distance */ \
782 pixman_composite_over_reverse_n_8888_init, \
784 pixman_composite_over_8888_8888_process_pixblock_head, \
785 pixman_composite_over_8888_8888_process_pixblock_tail, \
786 pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
787 28, /* dst_w_basereg */ \
788 0, /* dst_r_basereg */ \
789 4, /* src_basereg */ \
790 24 /* mask_basereg */
792 /******************************************************************************/
/*
 * Head half of OVER with a solid source and an a8 mask onto r5g6b5.
 * Stage 1 multiplies the replicated solid colour components by the mask
 * (d24) and scales the products to 8 bits in d0..d3, which then act as the
 * effective source. Stage 2 is the same destination blend as in the
 * over_8888_0565 head macro.
 * NOTE(review): several instructions are missing from this listing: the
 * multiplies producing q0/q1, the alpha inversion and the producers of the
 * q10/q11 values used in stage 2 - confirm against the complete file.
 */
794 .macro pixman_composite_over_n_8_0565_process_pixblock_head
/* stage 1: widening multiplies of mask d24 with colour components d10, d11 */
798 vmull.u8 q6, d24, d10
799 vmull.u8 q7, d24, d11
/* vrshr #8 followed by vraddhn gives a rounded division of each product by 255 */
800 vrshr.u16 q10, q0, #8
801 vrshr.u16 q11, q1, #8
802 vrshr.u16 q12, q6, #8
803 vrshr.u16 q13, q7, #8
804 vraddhn.u16 d0, q0, q10
805 vraddhn.u16 d1, q1, q11
806 vraddhn.u16 d2, q6, q12
807 vraddhn.u16 d3, q7, q13
/* stage 2: each 16-bit lane of q2 shifted right by 2 and truncated to 8 bits */
815 vshrn.u16 d30, q2, #2
816 /* now do alpha blending */
819 vmull.u8 q12, d3, d30
820 vrshr.u16 q13, q10, #8
821 vrshr.u16 q3, q11, #8
822 vrshr.u16 q15, q12, #8
823 vraddhn.u16 d20, q10, q13
824 vraddhn.u16 d23, q11, q3
825 vraddhn.u16 d22, q12, q15
/*
 * Tail half of OVER solid + a8 mask -> r5g6b5: adds the masked source to
 * the alpha-scaled destination and repacks the result into q14 (d28, d29).
 * NOTE(review): the instructions that fill q9 are not present in this listing.
 */
828 .macro pixman_composite_over_n_8_0565_process_pixblock_tail
/* saturating add of d2 and the scaled destination in d20 */
829 vqadd.u8 d16, d2, d20
831 /* convert to r5g6b5 */
/* each byte of d16 is placed in the high byte of a 16-bit lane of q14 */
832 vshll.u8 q14, d16, #8
/* (q9 >> 11) is inserted into the low 5 bits of each lane of q14 */
836 vsri.u16 q14, q9, #11
839 /* TODO: expand macros and do better instructions scheduling */
/*
 * Unscheduled main loop body: tail of the previous block, store, load of
 * the next destination block, head of the next block.
 * NOTE(review): no mask fetch is visible between the load and the head;
 * presumably it is on a line missing from this listing (original lines
 * 844-845) - confirm.
 */
840 .macro pixman_composite_over_n_8_0565_process_pixblock_tail_head
841 pixman_composite_over_n_8_0565_process_pixblock_tail
/* write out the 8 finished r5g6b5 pixels */
842 vst1.16 {d28, d29}, [DST_W, :128]!
/* read the next 8 destination pixels */
843 vld1.16 {d4, d5}, [DST_R, :128]!
846 pixman_composite_over_n_8_0565_process_pixblock_head
850 * This function needs a special initialization of solid source.
851 * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
852 * offset, split into color components and replicated in d8-d11
853 * registers. Additionally, this function needs all the NEON registers,
854 * so it has to save d8-d15 registers which are callee saved according
855 * to ABI. These registers are restored from 'cleanup' macro. All the
856 * other NEON registers are caller saved, so can be clobbered freely
857 * without introducing any problems.
/*
 * One-time setup for OVER solid + a8 mask -> r5g6b5: fetches the 32-bit
 * solid source value from the stack at ARGS_STACK_OFFSET into lane 0 of d11.
 * NOTE(review): the preceding comment block says d8-d15 are saved here and
 * the colour is replicated into d8-d11; those instructions are not present
 * in this listing - confirm against the complete file.
 */
859 .macro pixman_composite_over_n_8_0565_init
/* DUMMY = address of the stack-passed argument */
860 add DUMMY, sp, #ARGS_STACK_OFFSET
862 vld1.32 {d11[0]}, [DUMMY]
869 .macro pixman_composite_over_n_8_0565_cleanup
873 generate_composite_function \
874 pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
875 FLAG_DST_READWRITE, \
876 8, /* number of pixels, processed in a single block */ \
877 5, /* prefetch distance */ \
878 pixman_composite_over_n_8_0565_init, \
879 pixman_composite_over_n_8_0565_cleanup, \
880 pixman_composite_over_n_8_0565_process_pixblock_head, \
881 pixman_composite_over_n_8_0565_process_pixblock_tail, \
882 pixman_composite_over_n_8_0565_process_pixblock_tail_head
884 /******************************************************************************/
886 /* TODO: expand macros and do better instructions scheduling */
/*
 * Unscheduled main loop body for OVER 8888 source + a8 mask -> r5g6b5,
 * built from the over_n_8_0565 head and tail macros. The store is issued
 * after the head; the head instructions visible in this listing do not
 * write q14 (d28, d29), so the stored data is still the tail's result.
 * NOTE(review): the source and mask fetches (original lines 890-892) are
 * missing from this listing - confirm against the complete file.
 */
887 .macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
/* read the next 8 destination pixels */
888 vld1.16 {d4, d5}, [DST_R, :128]!
889 pixman_composite_over_n_8_0565_process_pixblock_tail
893 pixman_composite_over_n_8_0565_process_pixblock_head
/* write out the 8 pixels finished by the tail above */
894 vst1.16 {d28, d29}, [DST_W, :128]!
897 generate_composite_function \
898 pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
899 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
900 8, /* number of pixels, processed in a single block */ \
901 5, /* prefetch distance */ \
902 default_init_need_all_regs, \
903 default_cleanup_need_all_regs, \
904 pixman_composite_over_n_8_0565_process_pixblock_head, \
905 pixman_composite_over_n_8_0565_process_pixblock_tail, \
906 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
907 28, /* dst_w_basereg */ \
908 4, /* dst_r_basereg */ \
909 8, /* src_basereg */ \
910 24 /* mask_basereg */
912 /******************************************************************************/
914 .macro pixman_composite_src_0565_0565_process_pixblock_head
917 .macro pixman_composite_src_0565_0565_process_pixblock_tail
920 .macro pixman_composite_src_0565_0565_process_pixblock_tail_head
921 vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
926 generate_composite_function \
927 pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
928 FLAG_DST_WRITEONLY, \
929 16, /* number of pixels, processed in a single block */ \
930 10, /* prefetch distance */ \
933 pixman_composite_src_0565_0565_process_pixblock_head, \
934 pixman_composite_src_0565_0565_process_pixblock_tail, \
935 pixman_composite_src_0565_0565_process_pixblock_tail_head, \
936 0, /* dst_w_basereg */ \
937 0, /* dst_r_basereg */ \
938 0, /* src_basereg */ \
941 /******************************************************************************/
943 .macro pixman_composite_src_n_8_process_pixblock_head
946 .macro pixman_composite_src_n_8_process_pixblock_tail
949 .macro pixman_composite_src_n_8_process_pixblock_tail_head
950 vst1.8 {d0, d1, d2, d3}, [DST_W, :128]!
/*
 * One-time setup for solid fill of an 8bpp destination: fetches a 32-bit
 * value from the stack at ARGS_STACK_OFFSET into lane 0 of d0.
 * NOTE(review): the tail_head macro stores d0..d3, so the value presumably
 * gets replicated across those registers by instructions missing from this
 * listing - confirm against the complete file.
 */
953 .macro pixman_composite_src_n_8_init
/* DUMMY = address of the stack-passed argument */
954 add DUMMY, sp, #ARGS_STACK_OFFSET
955 vld1.32 {d0[0]}, [DUMMY]
963 .macro pixman_composite_src_n_8_cleanup
966 generate_composite_function \
967 pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
968 FLAG_DST_WRITEONLY, \
969 32, /* number of pixels, processed in a single block */ \
970 0, /* prefetch distance */ \
971 pixman_composite_src_n_8_init, \
972 pixman_composite_src_n_8_cleanup, \
973 pixman_composite_src_n_8_process_pixblock_head, \
974 pixman_composite_src_n_8_process_pixblock_tail, \
975 pixman_composite_src_n_8_process_pixblock_tail_head, \
976 0, /* dst_w_basereg */ \
977 0, /* dst_r_basereg */ \
978 0, /* src_basereg */ \
981 /******************************************************************************/
983 .macro pixman_composite_src_n_0565_process_pixblock_head
986 .macro pixman_composite_src_n_0565_process_pixblock_tail
989 .macro pixman_composite_src_n_0565_process_pixblock_tail_head
990 vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
/*
 * One-time setup for solid fill of a 16bpp destination: fetches a 32-bit
 * value from the stack at ARGS_STACK_OFFSET into lane 0 of d0.
 * NOTE(review): the tail_head macro stores d0..d3, so the value presumably
 * gets replicated across those registers by instructions missing from this
 * listing - confirm against the complete file.
 */
993 .macro pixman_composite_src_n_0565_init
/* DUMMY = address of the stack-passed argument */
994 add DUMMY, sp, #ARGS_STACK_OFFSET
995 vld1.32 {d0[0]}, [DUMMY]
1002 .macro pixman_composite_src_n_0565_cleanup
1005 generate_composite_function \
1006 pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
1007 FLAG_DST_WRITEONLY, \
1008 16, /* number of pixels, processed in a single block */ \
1009 0, /* prefetch distance */ \
1010 pixman_composite_src_n_0565_init, \
1011 pixman_composite_src_n_0565_cleanup, \
1012 pixman_composite_src_n_0565_process_pixblock_head, \
1013 pixman_composite_src_n_0565_process_pixblock_tail, \
1014 pixman_composite_src_n_0565_process_pixblock_tail_head, \
1015 0, /* dst_w_basereg */ \
1016 0, /* dst_r_basereg */ \
1017 0, /* src_basereg */ \
1018 0 /* mask_basereg */
1020 /******************************************************************************/
/*
 * SRC fast path writing a constant fill pattern to a 32-bit destination.
 * NOTE(review): macro terminators and some body lines are not visible in
 * this chunk; comments cover only the instructions shown.
 */
1022 .macro pixman_composite_src_n_8888_process_pixblock_head
1025 .macro pixman_composite_src_n_8888_process_pixblock_tail
1028 .macro pixman_composite_src_n_8888_process_pixblock_tail_head
/* store d0-d3 (8 words) to a 128-bit aligned address and advance DST_W */
1029 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1032 .macro pixman_composite_src_n_8888_init
/* DUMMY = address of the stack-passed arguments */
1033 add DUMMY, sp, #ARGS_STACK_OFFSET
/* load one 32-bit word into the low lane of d0 */
1034 vld1.32 {d0[0]}, [DUMMY]
/* shift-left-and-insert by 32: copies the low word of d0 into its high
 * word, so d0 holds the same 32-bit value twice */
1035 vsli.u64 d0, d0, #32
1040 .macro pixman_composite_src_n_8888_cleanup
/*
 * Instantiate the function. The three numbers after the name are presumably
 * src/mask/dst bits per pixel (generator macro defined elsewhere - TODO
 * confirm).
 */
1043 generate_composite_function \
1044 pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
1045 FLAG_DST_WRITEONLY, \
1046 8, /* number of pixels, processed in a single block */ \
1047 0, /* prefetch distance */ \
1048 pixman_composite_src_n_8888_init, \
1049 pixman_composite_src_n_8888_cleanup, \
1050 pixman_composite_src_n_8888_process_pixblock_head, \
1051 pixman_composite_src_n_8888_process_pixblock_tail, \
1052 pixman_composite_src_n_8888_process_pixblock_tail_head, \
1053 0, /* dst_w_basereg */ \
1054 0, /* dst_r_basereg */ \
1055 0, /* src_basereg */ \
1056 0 /* mask_basereg */
1058 /******************************************************************************/
/*
 * SRC fast path from a 32-bit source to a 32-bit destination.
 * head/tail have no visible body; the only visible work is the store in
 * tail_head. NOTE(review): the line(s) that load the next source block are
 * not visible in this chunk.
 */
1060 .macro pixman_composite_src_8888_8888_process_pixblock_head
1063 .macro pixman_composite_src_8888_8888_process_pixblock_tail
1066 .macro pixman_composite_src_8888_8888_process_pixblock_tail_head
/* store d0-d3 (8 words) to a 128-bit aligned address and advance DST_W */
1067 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
/*
 * Instantiate the function. The three numbers after the name are presumably
 * src/mask/dst bits per pixel (generator macro defined elsewhere - TODO
 * confirm).
 */
1072 generate_composite_function \
1073 pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
1074 FLAG_DST_WRITEONLY, \
1075 8, /* number of pixels, processed in a single block */ \
1076 10, /* prefetch distance */ \
1079 pixman_composite_src_8888_8888_process_pixblock_head, \
1080 pixman_composite_src_8888_8888_process_pixblock_tail, \
1081 pixman_composite_src_8888_8888_process_pixblock_tail_head, \
1082 0, /* dst_w_basereg */ \
1083 0, /* dst_r_basereg */ \
1084 0, /* src_basereg */ \
1085 0 /* mask_basereg */
1087 /******************************************************************************/
/*
 * SRC fast path from an x888 source to an 8888 destination.
 * NOTE(review): the head/tail bodies and part of the init macro are not
 * visible in this chunk; comments cover only the instructions shown.
 */
1089 .macro pixman_composite_src_x888_8888_process_pixblock_head
1094 .macro pixman_composite_src_x888_8888_process_pixblock_tail
1097 .macro pixman_composite_src_x888_8888_process_pixblock_tail_head
/* store d0-d3 (8 words) to a 128-bit aligned address and advance DST_W */
1098 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1105 .macro pixman_composite_src_x888_8888_init
/* move the low byte of every 32-bit lane of q2 into the top byte
 * (presumably building an alpha-channel mask - the instruction that
 * initialises q2 is not visible here, TODO confirm) */
1107 vshl.u32 q2, q2, #24
/*
 * Instantiate the function. The three numbers after the name are presumably
 * src/mask/dst bits per pixel (generator macro defined elsewhere - TODO
 * confirm).
 */
1110 generate_composite_function \
1111 pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
1112 FLAG_DST_WRITEONLY, \
1113 8, /* number of pixels, processed in a single block */ \
1114 10, /* prefetch distance */ \
1115 pixman_composite_src_x888_8888_init, \
1117 pixman_composite_src_x888_8888_process_pixblock_head, \
1118 pixman_composite_src_x888_8888_process_pixblock_tail, \
1119 pixman_composite_src_x888_8888_process_pixblock_tail_head, \
1120 0, /* dst_w_basereg */ \
1121 0, /* dst_r_basereg */ \
1122 0, /* src_basereg */ \
1123 0 /* mask_basereg */
1125 /******************************************************************************/
/*
 * First half of OVER with a solid source and an 8-bit mask on a 32-bit
 * destination: multiplies the source by the mask and starts the
 * "destination * inverted alpha" products. The division of those products
 * by 255 is completed by the matching tail macro.
 */
1127 .macro pixman_composite_over_n_8_8888_process_pixblock_head
1128 /* expecting deinterleaved source data in {d8, d9, d10, d11} */
1129 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
1130 /* and destination data in {d4, d5, d6, d7} */
1131 /* mask is in d24 (d25, d26, d27 are unused) */
/* widening multiply: 16-bit products of mask and each source channel */
1134 vmull.u8 q0, d24, d8
1135 vmull.u8 q1, d24, d9
1136 vmull.u8 q6, d24, d10
1137 vmull.u8 q7, d24, d11
/* rounding divide by 255, step 1: t = (x + 128) >> 8 */
1138 vrshr.u16 q10, q0, #8
1139 vrshr.u16 q11, q1, #8
1140 vrshr.u16 q12, q6, #8
1141 vrshr.u16 q13, q7, #8
/* step 2: result = (x + t + 128) >> 8, narrowed back to 8 bits */
1142 vraddhn.u16 d0, q0, q10
1143 vraddhn.u16 d1, q1, q11
1144 vraddhn.u16 d2, q6, q12
1145 vraddhn.u16 d3, q7, q13
1146 vmvn.8 d24, d3 /* get inverted alpha */
1147 /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */
1148 /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
1149 /* now do alpha blending */
/* 16-bit products of inverted alpha and each destination channel */
1150 vmull.u8 q8, d24, d4
1151 vmull.u8 q9, d24, d5
1152 vmull.u8 q10, d24, d6
1153 vmull.u8 q11, d24, d7
/*
 * Second half of OVER (solid source, 8-bit mask, 32-bit destination):
 * finishes the rounding division by 255 of the products left in q8-q11 by
 * the head macro and adds the masked source. Result is left in d28-d31.
 */
1156 .macro pixman_composite_over_n_8_8888_process_pixblock_tail
/* t = (x + 128) >> 8 for each 16-bit product */
1157 vrshr.u16 q14, q8, #8
1158 vrshr.u16 q15, q9, #8
1159 vrshr.u16 q12, q10, #8
1160 vrshr.u16 q13, q11, #8
/* (x + t + 128) >> 8, narrowed to 8 bits per channel */
1161 vraddhn.u16 d28, q14, q8
1162 vraddhn.u16 d29, q15, q9
1163 vraddhn.u16 d30, q12, q10
1164 vraddhn.u16 d31, q13, q11
/* saturating add of the masked source (q0, q1 = d0-d3) */
1165 vqadd.u8 q14, q0, q14
1166 vqadd.u8 q15, q1, q15
1169 /* TODO: expand macros and do better instructions scheduling */
/*
 * Combined step: finish the current block, write it out, read the next
 * destination block and start processing it.
 * NOTE(review): some lines of this macro are not visible in this chunk.
 */
1170 .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
1171 pixman_composite_over_n_8_8888_process_pixblock_tail
/* store the finished pixels, re-interleaving the four channel registers */
1172 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* load the next destination pixels, de-interleaved into d4-d7 */
1173 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1176 pixman_composite_over_n_8_8888_process_pixblock_head
/*
 * Setup/teardown macros and instantiation for over_n_8_8888.
 * NOTE(review): parts of the init and cleanup bodies are not visible in
 * this chunk; comments cover only the instructions shown.
 */
1179 .macro pixman_composite_over_n_8_8888_init
/* DUMMY = address of the stack-passed arguments */
1180 add DUMMY, sp, #ARGS_STACK_OFFSET
/* load one 32-bit word into lane 0 of d11
 * (presumably the solid source colour - confirm against the caller) */
1182 vld1.32 {d11[0]}, [DUMMY]
1189 .macro pixman_composite_over_n_8_8888_cleanup
/*
 * The three numbers after the function name are presumably src/mask/dst
 * bits per pixel (generator macro defined elsewhere - TODO confirm). No
 * register bases are passed, so the generator's defaults apply.
 */
1193 generate_composite_function \
1194 pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
1195 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1196 8, /* number of pixels, processed in a single block */ \
1197 5, /* prefetch distance */ \
1198 pixman_composite_over_n_8_8888_init, \
1199 pixman_composite_over_n_8_8888_cleanup, \
1200 pixman_composite_over_n_8_8888_process_pixblock_head, \
1201 pixman_composite_over_n_8_8888_process_pixblock_tail, \
1202 pixman_composite_over_n_8_8888_process_pixblock_tail_head
1204 /******************************************************************************/
/*
 * First half of OVER with a solid source and an 8-bit mask on an 8-bit
 * destination (32 pixels per block, one byte each).
 * Inputs as used below: d8 is multiplied with d24-d27, and d4-d7 are used
 * in the second group of multiplies (presumably source alpha, mask and
 * destination respectively - TODO confirm against init/fetch code).
 */
1206 .macro pixman_composite_over_n_8_8_process_pixblock_head
/* 16-bit products of d8 with each of the four 8-byte groups d24-d27 */
1207 vmull.u8 q0, d24, d8
1208 vmull.u8 q1, d25, d8
1209 vmull.u8 q6, d26, d8
1210 vmull.u8 q7, d27, d8
/* rounding divide by 255, step 1: t = (x + 128) >> 8
 * (q12/q13 alias d24-d27, so those registers are overwritten here) */
1211 vrshr.u16 q10, q0, #8
1212 vrshr.u16 q11, q1, #8
1213 vrshr.u16 q12, q6, #8
1214 vrshr.u16 q13, q7, #8
/* step 2: (x + t + 128) >> 8, narrowed to 8 bits -> d0-d3 */
1215 vraddhn.u16 d0, q0, q10
1216 vraddhn.u16 d1, q1, q11
1217 vraddhn.u16 d2, q6, q12
1218 vraddhn.u16 d3, q7, q13
/* NOTE(review): the instruction(s) that set d24-d27 for the multiplies
 * below are not visible in this chunk - confirm in the full file */
/* 16-bit products of d24-d27 with d4-d7; the division by 255 is finished
 * in the matching tail macro */
1221 vmull.u8 q8, d24, d4
1222 vmull.u8 q9, d25, d5
1223 vmull.u8 q10, d26, d6
1224 vmull.u8 q11, d27, d7
/*
 * Second half of over_n_8_8: rounding division by 255 of the products in
 * q8-q11, then saturating add of q0/q1. Result is left in d28-d31.
 */
1227 .macro pixman_composite_over_n_8_8_process_pixblock_tail
/* t = (x + 128) >> 8 */
1228 vrshr.u16 q14, q8, #8
1229 vrshr.u16 q15, q9, #8
1230 vrshr.u16 q12, q10, #8
1231 vrshr.u16 q13, q11, #8
/* (x + t + 128) >> 8, narrowed to 8 bits */
1232 vraddhn.u16 d28, q14, q8
1233 vraddhn.u16 d29, q15, q9
1234 vraddhn.u16 d30, q12, q10
1235 vraddhn.u16 d31, q13, q11
/* saturating add of the values computed by the head macro (d0-d3) */
1236 vqadd.u8 q14, q0, q14
1237 vqadd.u8 q15, q1, q15
1240 /* TODO: expand macros and do better instructions scheduling */
/*
 * Combined step: read the next destination block, finish and store the
 * current block, then start processing the next one.
 * NOTE(review): some lines of this macro are not visible in this chunk.
 */
1241 .macro pixman_composite_over_n_8_8_process_pixblock_tail_head
/* next 32 destination bytes -> d4-d7 */
1242 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1243 pixman_composite_over_n_8_8_process_pixblock_tail
/* helper macro defined outside this chunk; presumably issues prefetches */
1245 cache_preload 32, 32
/* write the 32 finished bytes */
1246 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1247 pixman_composite_over_n_8_8_process_pixblock_head
/*
 * Setup/teardown macros and instantiation for over_n_8_8.
 * NOTE(review): parts of the init and cleanup bodies are not visible in
 * this chunk; comments cover only the instructions shown.
 */
1250 .macro pixman_composite_over_n_8_8_init
/* DUMMY = address of the stack-passed arguments */
1251 add DUMMY, sp, #ARGS_STACK_OFFSET
/* load one 32-bit word into lane 0 of d8
 * (presumably the solid source colour - confirm against the caller) */
1253 vld1.32 {d8[0]}, [DUMMY]
1257 .macro pixman_composite_over_n_8_8_cleanup
/*
 * The three numbers after the function name are presumably src/mask/dst
 * bits per pixel (generator macro defined elsewhere - TODO confirm). No
 * register bases are passed, so the generator's defaults apply.
 */
1261 generate_composite_function \
1262 pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
1263 FLAG_DST_READWRITE, \
1264 32, /* number of pixels, processed in a single block */ \
1265 5, /* prefetch distance */ \
1266 pixman_composite_over_n_8_8_init, \
1267 pixman_composite_over_n_8_8_cleanup, \
1268 pixman_composite_over_n_8_8_process_pixblock_head, \
1269 pixman_composite_over_n_8_8_process_pixblock_tail, \
1270 pixman_composite_over_n_8_8_process_pixblock_tail_head
1272 /******************************************************************************/
/*
 * First half of component-alpha OVER with a solid source and a 32-bit
 * mask on a 32-bit destination. Computes (source * mask) per channel and
 * (source alpha * mask) per channel, then starts the products with the
 * destination. NOTE(review): a few lines of this macro (including comment
 * delimiters) are not visible in this chunk.
 */
1274 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1276 * 'combine_mask_ca' replacement
1278 * input: solid src (n) in {d8, d9, d10, d11}
1279 * dest in {d4, d5, d6, d7 }
1280 * mask in {d24, d25, d26, d27}
1281 * output: updated src in {d0, d1, d2, d3 }
1282 * updated mask in {d24, d25, d26, d3 }
1284 vmull.u8 q0, d24, d8
1285 vmull.u8 q1, d25, d9
1286 vmull.u8 q6, d26, d10
1287 vmull.u8 q7, d27, d11
1288 vmull.u8 q9, d11, d25
1289 vmull.u8 q12, d11, d24
1290 vmull.u8 q13, d11, d26
1291 vrshr.u16 q8, q0, #8
1292 vrshr.u16 q10, q1, #8
1293 vrshr.u16 q11, q6, #8
1294 vraddhn.u16 d0, q0, q8
1295 vraddhn.u16 d1, q1, q10
1296 vraddhn.u16 d2, q6, q11
1297 vrshr.u16 q11, q12, #8
1298 vrshr.u16 q8, q9, #8
1299 vrshr.u16 q6, q13, #8
1300 vrshr.u16 q10, q7, #8
1301 vraddhn.u16 d24, q12, q11
1302 vraddhn.u16 d25, q9, q8
1303 vraddhn.u16 d26, q13, q6
1304 vraddhn.u16 d3, q7, q10
1306 * 'combine_over_ca' replacement
1308 * output: updated dest in {d28, d29, d30, d31}
1312 vmull.u8 q8, d24, d4
1313 vmull.u8 q9, d25, d5
1316 vmull.u8 q10, d26, d6
1317 vmull.u8 q11, d27, d7
/*
 * Second half of component-alpha OVER: rounding division by 255 of the
 * products in q8-q11, then saturating add of the updated source in q0/q1.
 * Result is left in d28-d31.
 */
1320 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
1321 /* ... continue 'combine_over_ca' replacement */
/* t = (x + 128) >> 8 */
1322 vrshr.u16 q14, q8, #8
1323 vrshr.u16 q15, q9, #8
1324 vrshr.u16 q6, q10, #8
1325 vrshr.u16 q7, q11, #8
/* (x + t + 128) >> 8, narrowed to 8 bits */
1326 vraddhn.u16 d28, q14, q8
1327 vraddhn.u16 d29, q15, q9
1328 vraddhn.u16 d30, q6, q10
1329 vraddhn.u16 d31, q7, q11
/* saturating add of the updated source (d0-d3) */
1330 vqadd.u8 q14, q0, q14
1331 vqadd.u8 q15, q1, q15
/*
 * Combined step for component-alpha OVER. Unlike the simpler variants, the
 * tail is written out inline here (same instructions as the tail macro)
 * with the destination load placed between them, then the head macro is
 * invoked and the finished block is stored.
 * NOTE(review): some lines of this macro are not visible in this chunk.
 */
1334 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1335 vrshr.u16 q14, q8, #8
1336 vrshr.u16 q15, q9, #8
/* next destination pixels, de-interleaved into d4-d7 */
1337 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1338 vrshr.u16 q6, q10, #8
1339 vrshr.u16 q7, q11, #8
1340 vraddhn.u16 d28, q14, q8
1341 vraddhn.u16 d29, q15, q9
1342 vraddhn.u16 d30, q6, q10
1343 vraddhn.u16 d31, q7, q11
/* saturating add of the updated source; result for this block in d28-d31 */
1345 vqadd.u8 q14, q0, q14
1346 vqadd.u8 q15, q1, q15
/* start the next block; the head macro does not write q14/q15, so the
 * finished result is still intact for the store below */
1348 pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1349 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/*
 * Setup/teardown macros and instantiation for over_n_8888_8888_ca.
 * NOTE(review): parts of the init and cleanup bodies are not visible in
 * this chunk; comments cover only the instructions shown.
 */
1352 .macro pixman_composite_over_n_8888_8888_ca_init
/* DUMMY = address of the stack-passed arguments */
1353 add DUMMY, sp, #ARGS_STACK_OFFSET
/* load one 32-bit word into lane 0 of d11
 * (presumably the solid source colour - confirm against the caller) */
1355 vld1.32 {d11[0]}, [DUMMY]
1362 .macro pixman_composite_over_n_8888_8888_ca_cleanup
/*
 * The three numbers after the function name are presumably src/mask/dst
 * bits per pixel (generator macro defined elsewhere - TODO confirm). No
 * register bases are passed, so the generator's defaults apply.
 */
1366 generate_composite_function \
1367 pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
1368 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1369 8, /* number of pixels, processed in a single block */ \
1370 5, /* prefetch distance */ \
1371 pixman_composite_over_n_8888_8888_ca_init, \
1372 pixman_composite_over_n_8888_8888_ca_cleanup, \
1373 pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
1374 pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
1375 pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1377 /******************************************************************************/
/*
 * ADD with a solid source and an 8-bit mask on an 8-bit destination.
 * All arithmetic is done here; the matching tail macro has no visible body.
 * Result is left in d28-d31.
 */
1379 .macro pixman_composite_add_n_8_8_process_pixblock_head
1380 /* expecting source data in {d8, d9, d10, d11} */
1381 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
1382 /* and destination data in {d4, d5, d6, d7} */
1383 /* mask is in d24, d25, d26, d27 */
/* 16-bit products of source alpha (d11) and each group of mask bytes */
1384 vmull.u8 q0, d24, d11
1385 vmull.u8 q1, d25, d11
1386 vmull.u8 q6, d26, d11
1387 vmull.u8 q7, d27, d11
/* rounding divide by 255, step 1: t = (x + 128) >> 8 */
1388 vrshr.u16 q10, q0, #8
1389 vrshr.u16 q11, q1, #8
1390 vrshr.u16 q12, q6, #8
1391 vrshr.u16 q13, q7, #8
/* step 2: (x + t + 128) >> 8, narrowed to 8 bits -> d0-d3 */
1392 vraddhn.u16 d0, q0, q10
1393 vraddhn.u16 d1, q1, q11
1394 vraddhn.u16 d2, q6, q12
1395 vraddhn.u16 d3, q7, q13
/* saturating add with the destination (q2, q3 = d4-d7) */
1396 vqadd.u8 q14, q0, q2
1397 vqadd.u8 q15, q1, q3
/* tail: no visible body (the head macro already leaves the result in d28-d31) */
1400 .macro pixman_composite_add_n_8_8_process_pixblock_tail
1403 /* TODO: expand macros and do better instructions scheduling */
/*
 * Combined step: store the finished block, load the next destination block
 * and process it. NOTE(review): some lines are not visible in this chunk.
 */
1404 .macro pixman_composite_add_n_8_8_process_pixblock_tail_head
1405 pixman_composite_add_n_8_8_process_pixblock_tail
/* write the 32 finished bytes */
1406 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* next 32 destination bytes -> d4-d7 */
1407 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
/* helper macro defined outside this chunk; presumably issues prefetches */
1409 cache_preload 32, 32
1410 pixman_composite_add_n_8_8_process_pixblock_head
/*
 * Setup/teardown macros and instantiation for add_n_8_8.
 * NOTE(review): parts of the init and cleanup bodies are not visible in
 * this chunk; comments cover only the instructions shown.
 */
1413 .macro pixman_composite_add_n_8_8_init
/* DUMMY = address of the stack-passed arguments */
1414 add DUMMY, sp, #ARGS_STACK_OFFSET
/* load one 32-bit word into lane 0 of d11
 * (presumably the solid source colour - confirm against the caller) */
1416 vld1.32 {d11[0]}, [DUMMY]
1420 .macro pixman_composite_add_n_8_8_cleanup
/*
 * The three numbers after the function name are presumably src/mask/dst
 * bits per pixel (generator macro defined elsewhere - TODO confirm). No
 * register bases are passed, so the generator's defaults apply.
 */
1424 generate_composite_function \
1425 pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
1426 FLAG_DST_READWRITE, \
1427 32, /* number of pixels, processed in a single block */ \
1428 5, /* prefetch distance */ \
1429 pixman_composite_add_n_8_8_init, \
1430 pixman_composite_add_n_8_8_cleanup, \
1431 pixman_composite_add_n_8_8_process_pixblock_head, \
1432 pixman_composite_add_n_8_8_process_pixblock_tail, \
1433 pixman_composite_add_n_8_8_process_pixblock_tail_head
1435 /******************************************************************************/
/*
 * ADD with an 8-bit source and an 8-bit mask on an 8-bit destination:
 * result = saturate(dest + source * mask / 255), left in d28-d31.
 * All arithmetic is done here; the matching tail macro has no visible body.
 */
1437 .macro pixman_composite_add_8_8_8_process_pixblock_head
1438 /* expecting source data in {d0, d1, d2, d3} */
1439 /* destination data in {d4, d5, d6, d7} */
1440 /* mask in {d24, d25, d26, d27} */
/* 16-bit products of mask and source, byte by byte */
1441 vmull.u8 q8, d24, d0
1442 vmull.u8 q9, d25, d1
1443 vmull.u8 q10, d26, d2
1444 vmull.u8 q11, d27, d3
/* rounding divide by 255, step 1: t = (x + 128) >> 8 */
1445 vrshr.u16 q0, q8, #8
1446 vrshr.u16 q1, q9, #8
1447 vrshr.u16 q12, q10, #8
1448 vrshr.u16 q13, q11, #8
/* step 2: (x + t + 128) >> 8, narrowed to 8 bits -> d0-d3 */
1449 vraddhn.u16 d0, q0, q8
1450 vraddhn.u16 d1, q1, q9
1451 vraddhn.u16 d2, q12, q10
1452 vraddhn.u16 d3, q13, q11
/* saturating add with the destination (q2, q3 = d4-d7) */
1453 vqadd.u8 q14, q0, q2
1454 vqadd.u8 q15, q1, q3
/* tail: no visible body (the head macro already leaves the result in d28-d31) */
1457 .macro pixman_composite_add_8_8_8_process_pixblock_tail
1460 /* TODO: expand macros and do better instructions scheduling */
/*
 * Combined step: store the finished block, load the next destination block
 * and process it. NOTE(review): some lines are not visible in this chunk.
 */
1461 .macro pixman_composite_add_8_8_8_process_pixblock_tail_head
1462 pixman_composite_add_8_8_8_process_pixblock_tail
/* write the 32 finished bytes */
1463 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* next 32 destination bytes -> d4-d7 */
1464 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
/* helper macro defined outside this chunk; presumably issues prefetches */
1467 cache_preload 32, 32
1468 pixman_composite_add_8_8_8_process_pixblock_head
/*
 * Setup/teardown macros (no visible body) and instantiation for add_8_8_8.
 */
1471 .macro pixman_composite_add_8_8_8_init
1474 .macro pixman_composite_add_8_8_8_cleanup
/*
 * The three numbers after the function name are presumably src/mask/dst
 * bits per pixel (generator macro defined elsewhere - TODO confirm). No
 * register bases are passed, so the generator's defaults apply.
 */
1477 generate_composite_function \
1478 pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
1479 FLAG_DST_READWRITE, \
1480 32, /* number of pixels, processed in a single block */ \
1481 5, /* prefetch distance */ \
1482 pixman_composite_add_8_8_8_init, \
1483 pixman_composite_add_8_8_8_cleanup, \
1484 pixman_composite_add_8_8_8_process_pixblock_head, \
1485 pixman_composite_add_8_8_8_process_pixblock_tail, \
1486 pixman_composite_add_8_8_8_process_pixblock_tail_head
1488 /******************************************************************************/
/*
 * ADD with a 32-bit source and a 32-bit mask on a 32-bit destination.
 * Only d27 of the mask is used: every source channel is scaled by it.
 * With de-interleaved data d27 is the fourth channel register (presumably
 * the mask alpha - TODO confirm). Result is left in d28-d31.
 */
1490 .macro pixman_composite_add_8888_8888_8888_process_pixblock_head
1491 /* expecting source data in {d0, d1, d2, d3} */
1492 /* destination data in {d4, d5, d6, d7} */
1493 /* mask in {d24, d25, d26, d27} */
/* 16-bit products of d27 and each source channel */
1494 vmull.u8 q8, d27, d0
1495 vmull.u8 q9, d27, d1
1496 vmull.u8 q10, d27, d2
1497 vmull.u8 q11, d27, d3
/* rounding divide by 255, step 1: t = (x + 128) >> 8 */
1498 vrshr.u16 q0, q8, #8
1499 vrshr.u16 q1, q9, #8
1500 vrshr.u16 q12, q10, #8
1501 vrshr.u16 q13, q11, #8
/* step 2: (x + t + 128) >> 8, narrowed to 8 bits -> d0-d3 */
1502 vraddhn.u16 d0, q0, q8
1503 vraddhn.u16 d1, q1, q9
1504 vraddhn.u16 d2, q12, q10
1505 vraddhn.u16 d3, q13, q11
/* saturating add with the destination (q2, q3 = d4-d7) */
1506 vqadd.u8 q14, q0, q2
1507 vqadd.u8 q15, q1, q3
/* tail: no visible body (the head macro already leaves the result in d28-d31) */
1510 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
1513 /* TODO: expand macros and do better instructions scheduling */
/*
 * Combined step: store the finished block, load the next destination block
 * and process it. NOTE(review): some lines are not visible in this chunk.
 */
1514 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
1515 pixman_composite_add_8888_8888_8888_process_pixblock_tail
/* store the finished pixels, re-interleaving the four channel registers */
1516 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* next destination pixels, de-interleaved into d4-d7 */
1517 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1521 pixman_composite_add_8888_8888_8888_process_pixblock_head
/*
 * Full 2D function built from the add_8888_8888_8888 pixblock macros.
 * The three numbers after the function name are presumably src/mask/dst
 * bits per pixel (generator macro defined elsewhere - TODO confirm).
 * NOTE(review): the init/cleanup argument lines are not visible here.
 */
1524 generate_composite_function \
1525 pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
1526 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1527 8, /* number of pixels, processed in a single block */ \
1528 10, /* prefetch distance */ \
1531 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1532 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1533 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
/*
 * Single-scanline variant reusing the same add_8888_8888_8888 pixblock
 * macros. Unlike the 2D instantiation, no prefetch distance is passed.
 * NOTE(review): the init/cleanup argument lines are not visible here.
 */
1535 generate_composite_function_single_scanline \
1536 pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
1537 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1538 8, /* number of pixels, processed in a single block */ \
1541 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1542 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1543 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
1545 /******************************************************************************/
/*
 * First half of OUT_REVERSE with a 32-bit source and a mask value in d15:
 * scales the source by the mask, then starts the
 * "destination * inverted (masked) source alpha" products. The division of
 * those products by 255 is completed by the matching tail macro.
 */
1547 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
1548 /* expecting source data in {d0, d1, d2, d3} */
1549 /* destination data in {d4, d5, d6, d7} */
1550 /* solid mask is in d15 */
/* 16-bit products of mask (d15) and each source channel */
1553 vmull.u8 q8, d15, d3
1554 vmull.u8 q6, d15, d2
1555 vmull.u8 q5, d15, d1
1556 vmull.u8 q4, d15, d0
/* rounding divide by 255, step 1: t = (x + 128) >> 8 */
1557 vrshr.u16 q13, q8, #8
1558 vrshr.u16 q12, q6, #8
1559 vrshr.u16 q11, q5, #8
1560 vrshr.u16 q10, q4, #8
/* step 2: (x + t + 128) >> 8, narrowed to 8 bits -> masked source in d0-d3 */
1561 vraddhn.u16 d3, q8, q13
1562 vraddhn.u16 d2, q6, q12
1563 vraddhn.u16 d1, q5, q11
1564 vraddhn.u16 d0, q4, q10
1565 vmvn.8 d24, d3 /* get inverted alpha */
1566 /* now do alpha blending */
/* 16-bit products of inverted alpha and each destination channel */
1567 vmull.u8 q8, d24, d4
1568 vmull.u8 q9, d24, d5
1569 vmull.u8 q10, d24, d6
1570 vmull.u8 q11, d24, d7
/*
 * Second half of OUT_REVERSE: rounding division by 255 of the products in
 * q8-q11. The result (destination scaled by inverted alpha) is left in
 * d28-d31; no source is added here.
 */
1573 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
/* t = (x + 128) >> 8 */
1574 vrshr.u16 q14, q8, #8
1575 vrshr.u16 q15, q9, #8
1576 vrshr.u16 q12, q10, #8
1577 vrshr.u16 q13, q11, #8
/* (x + t + 128) >> 8, narrowed to 8 bits */
1578 vraddhn.u16 d28, q14, q8
1579 vraddhn.u16 d29, q15, q9
1580 vraddhn.u16 d30, q12, q10
1581 vraddhn.u16 d31, q13, q11
1584 /* TODO: expand macros and do better instructions scheduling */
/*
 * Combined step used by the 8888_8888_8888 (per-pixel mask) variant; it
 * reuses the 8888_n_8888 head and tail macros.
 * NOTE(review): some lines of this macro are not visible in this chunk.
 */
1585 .macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
/* next destination pixels, de-interleaved into d4-d7 */
1586 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1587 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
/* the head macro does not write q14/q15, so the finished block is still
 * intact when stored afterwards */
1591 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
1592 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/*
 * Single-scanline OUT_REVERSE with a per-pixel 32-bit mask.
 * Reuses the 8888_n_8888 head/tail macros, which read the mask from d15;
 * mask_basereg 12 makes the mask block occupy d12-d15 (presumably so the
 * fourth, alpha, register is d15 - TODO confirm against the generator).
 * The three numbers after the function name are presumably src/mask/dst
 * bits per pixel.
 * Every argument is comma-separated, including the one after the tail_head
 * macro name, so the argument list does not depend on the assembler
 * accepting whitespace as a macro-argument separator.
 */
1595 generate_composite_function_single_scanline \
1596 pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
1597 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1598 8, /* number of pixels, processed in a single block */ \
1599 default_init_need_all_regs, \
1600 default_cleanup_need_all_regs, \
1601 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
1602 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
1603 pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head, \
1604 28, /* dst_w_basereg */ \
1605 4, /* dst_r_basereg */ \
1606 0, /* src_basereg */ \
1607 12 /* mask_basereg */
1609 /******************************************************************************/
/*
 * OVER with a 32-bit source and a mask in d15, built on top of the
 * OUT_REVERSE macros: the head is identical, and the tail adds the masked
 * source to the OUT_REVERSE result.
 */
1611 .macro pixman_composite_over_8888_n_8888_process_pixblock_head
1612 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
1615 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail
1616 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
/* saturating add of the masked source (q0, q1 = d0-d3) -> d28-d31 */
1617 vqadd.u8 q14, q0, q14
1618 vqadd.u8 q15, q1, q15
1621 /* TODO: expand macros and do better instructions scheduling */
/* NOTE(review): some lines of this macro are not visible in this chunk */
1622 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
/* next destination pixels, de-interleaved into d4-d7 */
1623 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1624 pixman_composite_over_8888_n_8888_process_pixblock_tail
1627 pixman_composite_over_8888_n_8888_process_pixblock_head
/* store the block finished by the tail above */
1628 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/*
 * Setup/teardown macros and instantiation for over_8888_n_8888.
 * NOTE(review): parts of the init and cleanup bodies (including the line
 * that sets DUMMY) are not visible in this chunk.
 */
1631 .macro pixman_composite_over_8888_n_8888_init
/* load one 32-bit word from [DUMMY] into lane 0 of d15
 * (presumably the solid mask value - confirm against the caller) */
1634 vld1.32 {d15[0]}, [DUMMY]
1638 .macro pixman_composite_over_8888_n_8888_cleanup
/*
 * The three numbers after the function name are presumably src/mask/dst
 * bits per pixel (generator macro defined elsewhere - TODO confirm). No
 * register bases are passed, so the generator's defaults apply.
 */
1642 generate_composite_function \
1643 pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
1644 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1645 8, /* number of pixels, processed in a single block */ \
1646 5, /* prefetch distance */ \
1647 pixman_composite_over_8888_n_8888_init, \
1648 pixman_composite_over_8888_n_8888_cleanup, \
1649 pixman_composite_over_8888_n_8888_process_pixblock_head, \
1650 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1651 pixman_composite_over_8888_n_8888_process_pixblock_tail_head
1653 /******************************************************************************/
1655 /* TODO: expand macros and do better instructions scheduling */
/*
 * Combined step for OVER with a per-pixel 32-bit mask; it reuses the
 * over_8888_n_8888 head and tail macros.
 * NOTE(review): some lines of this macro are not visible in this chunk.
 */
1656 .macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
/* next destination pixels, de-interleaved into d4-d7 */
1657 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1658 pixman_composite_over_8888_n_8888_process_pixblock_tail
1662 pixman_composite_over_8888_n_8888_process_pixblock_head
/* store the block finished by the tail above */
1663 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/*
 * Full 2D OVER with a per-pixel 32-bit mask.
 * Reuses the over_8888_n_8888 head/tail macros, which read the mask from
 * d15; mask_basereg 12 makes the mask block occupy d12-d15 (presumably so
 * the fourth, alpha, register is d15 - TODO confirm against the generator).
 * The three numbers after the function name are presumably src/mask/dst
 * bits per pixel.
 * Every argument is comma-separated, including the one after the tail_head
 * macro name, so the argument list does not depend on the assembler
 * accepting whitespace as a macro-argument separator.
 */
1666 generate_composite_function \
1667 pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
1668 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1669 8, /* number of pixels, processed in a single block */ \
1670 5, /* prefetch distance */ \
1671 default_init_need_all_regs, \
1672 default_cleanup_need_all_regs, \
1673 pixman_composite_over_8888_n_8888_process_pixblock_head, \
1674 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1675 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
1676 28, /* dst_w_basereg */ \
1677 4, /* dst_r_basereg */ \
1678 0, /* src_basereg */ \
1679 12 /* mask_basereg */
/*
 * Single-scanline OVER with a per-pixel 32-bit mask; same pixblock macros
 * and register bases as the 2D over_8888_8888_8888 function, but no
 * prefetch distance is passed.
 * Every argument is comma-separated, including the one after the tail_head
 * macro name, so the argument list does not depend on the assembler
 * accepting whitespace as a macro-argument separator.
 */
1681 generate_composite_function_single_scanline \
1682 pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
1683 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1684 8, /* number of pixels, processed in a single block */ \
1685 default_init_need_all_regs, \
1686 default_cleanup_need_all_regs, \
1687 pixman_composite_over_8888_n_8888_process_pixblock_head, \
1688 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1689 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
1690 28, /* dst_w_basereg */ \
1691 4, /* dst_r_basereg */ \
1692 0, /* src_basereg */ \
1693 12 /* mask_basereg */
1695 /******************************************************************************/
1697 /* TODO: expand macros and do better instructions scheduling */
/*
 * Combined step for OVER with a per-pixel 8-bit mask; it reuses the
 * over_8888_n_8888 head and tail macros.
 * NOTE(review): some lines of this macro are not visible in this chunk.
 */
1698 .macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
/* next destination pixels, de-interleaved into d4-d7 */
1699 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1700 pixman_composite_over_8888_n_8888_process_pixblock_tail
1704 pixman_composite_over_8888_n_8888_process_pixblock_head
/* store the block finished by the tail above */
1705 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/*
 * Full 2D OVER with a per-pixel 8-bit mask.
 * Reuses the over_8888_n_8888 head/tail macros, which read the mask from
 * d15; mask_basereg 15 selects that register as the mask base.
 * The three numbers after the function name are presumably src/mask/dst
 * bits per pixel (generator macro defined elsewhere - TODO confirm).
 * Every argument is comma-separated, including the one after the tail_head
 * macro name, so the argument list does not depend on the assembler
 * accepting whitespace as a macro-argument separator.
 */
1708 generate_composite_function \
1709 pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
1710 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1711 8, /* number of pixels, processed in a single block */ \
1712 5, /* prefetch distance */ \
1713 default_init_need_all_regs, \
1714 default_cleanup_need_all_regs, \
1715 pixman_composite_over_8888_n_8888_process_pixblock_head, \
1716 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1717 pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \
1718 28, /* dst_w_basereg */ \
1719 4, /* dst_r_basereg */ \
1720 0, /* src_basereg */ \
1721 15 /* mask_basereg */
1723 /******************************************************************************/
/*
 * SRC fast path between two 24-bit formats.
 * head/tail have no visible body; the only visible work is the store in
 * tail_head. NOTE(review): the line(s) that load the next source block are
 * not visible in this chunk.
 */
1725 .macro pixman_composite_src_0888_0888_process_pixblock_head
1728 .macro pixman_composite_src_0888_0888_process_pixblock_tail
1731 .macro pixman_composite_src_0888_0888_process_pixblock_tail_head
/* 3-way interleaved store of 8 pixels (24 bytes); no alignment qualifier
 * is given on the address, unlike the 16/32-bit paths */
1732 vst3.8 {d0, d1, d2}, [DST_W]!
/*
 * Instantiate the function. The three numbers after the name are presumably
 * src/mask/dst bits per pixel (generator macro defined elsewhere - TODO
 * confirm).
 */
1737 generate_composite_function \
1738 pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
1739 FLAG_DST_WRITEONLY, \
1740 8, /* number of pixels, processed in a single block */ \
1741 10, /* prefetch distance */ \
1744 pixman_composite_src_0888_0888_process_pixblock_head, \
1745 pixman_composite_src_0888_0888_process_pixblock_tail, \
1746 pixman_composite_src_0888_0888_process_pixblock_tail_head, \
1747 0, /* dst_w_basereg */ \
1748 0, /* dst_r_basereg */ \
1749 0, /* src_basereg */ \
1750 0 /* mask_basereg */
1752 /******************************************************************************/
/*
 * SRC fast path from a 24-bit source to a 32-bit destination ("rev"
 * presumably means reversed channel order - TODO confirm).
 * NOTE(review): the head, tail and init bodies are not visible in this
 * chunk; the only visible instruction is the store in tail_head.
 */
1754 .macro pixman_composite_src_0888_8888_rev_process_pixblock_head
1758 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
1761 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
/* 4-way interleaved store of 8 pixels (32 bytes); no alignment qualifier
 * is given on the address */
1762 vst4.8 {d0, d1, d2, d3}, [DST_W]!
1768 .macro pixman_composite_src_0888_8888_rev_init
/*
 * Instantiate the function. The three numbers after the name are presumably
 * src/mask/dst bits per pixel (generator macro defined elsewhere - TODO
 * confirm).
 */
1772 generate_composite_function \
1773 pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
1774 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
1775 8, /* number of pixels, processed in a single block */ \
1776 10, /* prefetch distance */ \
1777 pixman_composite_src_0888_8888_rev_init, \
1779 pixman_composite_src_0888_8888_rev_process_pixblock_head, \
1780 pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
1781 pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
1782 0, /* dst_w_basereg */ \
1783 0, /* dst_r_basereg */ \
1784 0, /* src_basereg */ \
1785 0 /* mask_basereg */
1787 /******************************************************************************/
/*
 * SRC fast path from a 24-bit source to a 16-bit (5-6-5) destination.
 * NOTE(review): the head body, which prepares q8 and q9, is not visible in
 * this chunk; comments cover only the instructions shown.
 */
1789 .macro pixman_composite_src_0888_0565_rev_process_pixblock_head
1794 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
/* widen d0 to 16 bits with the byte in the top half of each lane */
1795 vshll.u8 q14, d0, #8
/* insert q8 >> 5, keeping the top 5 bits already in q14 */
1796 vsri.u16 q14, q8, #5
/* insert q9 >> 11, keeping the top 11 bits -> packed 5-6-5 pixel */
1797 vsri.u16 q14, q9, #11
1800 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
/* same packing sequence as the tail macro, written inline */
1801 vshll.u8 q14, d0, #8
1803 vsri.u16 q14, q8, #5
1804 vsri.u16 q14, q9, #11
/* store 8 packed 16-bit pixels (q14 = d28, d29) */
1806 vst1.16 {d28, d29}, [DST_W, :128]!
/*
 * Instantiate the function. The three numbers after the name are presumably
 * src/mask/dst bits per pixel (generator macro defined elsewhere - TODO
 * confirm). dst_w_basereg 28 matches the d28/d29 result registers above.
 */
1810 generate_composite_function \
1811 pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
1812 FLAG_DST_WRITEONLY, \
1813 8, /* number of pixels, processed in a single block */ \
1814 10, /* prefetch distance */ \
1817 pixman_composite_src_0888_0565_rev_process_pixblock_head, \
1818 pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
1819 pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
1820 28, /* dst_w_basereg */ \
1821 0, /* dst_r_basereg */ \
1822 0, /* src_basereg */ \
1823 0 /* mask_basereg */
1825 /******************************************************************************/
/*
 * SRC fast path for "pixbuf" source data to a 32-bit destination: each
 * colour channel is multiplied by d3 and divided by 255 with rounding
 * (d3 is presumably the alpha channel, i.e. this premultiplies - TODO
 * confirm). NOTE(review): several lines of these macros are not visible in
 * this chunk, including the multiplies that produce q8 and q9 and whatever
 * sets d31.
 */
1827 .macro pixman_composite_src_pixbuf_8888_process_pixblock_head
/* 16-bit product of d3 and the third channel register */
1830 vmull.u8 q10, d3, d2
1833 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
/* rounding divide by 255, step 1: t = (x + 128) >> 8 */
1834 vrshr.u16 q11, q8, #8
1836 vrshr.u16 q12, q9, #8
1837 vrshr.u16 q13, q10, #8
/* step 2: (x + t + 128) >> 8, narrowed. Note the destination order:
 * q8 lands in d30 and q10 in d28, so the first and third channel swap
 * places relative to the order of the product registers */
1838 vraddhn.u16 d30, q11, q8
1839 vraddhn.u16 d29, q12, q9
1840 vraddhn.u16 d28, q13, q10
/*
 * Combined step: the tail arithmetic written inline, interleaved with
 * PF-prefixed prefetch bookkeeping (PF is a helper macro defined outside
 * this chunk; presumably it emits its instruction only when prefetching is
 * enabled), followed by the first visible instruction of the next head and
 * the store of the finished block.
 */
1843 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
1844 vrshr.u16 q11, q8, #8
1846 vrshr.u16 q12, q9, #8
1847 vrshr.u16 q13, q10, #8
1849 vraddhn.u16 d30, q11, q8
/* advance the prefetch position by one block of 8 pixels */
1850 PF add PF_X, PF_X, #8
1852 PF addne PF_X, PF_X, #8
1853 PF subne PF_CTL, PF_CTL, #1
1854 vraddhn.u16 d29, q12, q9
1855 vraddhn.u16 d28, q13, q10
/* start of the next block's multiplies */
1858 vmull.u8 q10, d3, d2
/* store the finished pixels, re-interleaving the four channel registers */
1859 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* prefetch source data at the current prefetch position */
1861 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
/* conditional (ge) wrap of the prefetch position by ORIG_W and move of
 * PF_SRC by SRC_STRIDE via a dummy byte load with write-back */
1862 PF subge PF_X, PF_X, ORIG_W
1863 PF subges PF_CTL, PF_CTL, #0x10
1864 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
/*
 * Instantiate the pixbuf -> 8888 SRC function. The three numbers after the
 * name are presumably src/mask/dst bits per pixel (generator macro defined
 * elsewhere - TODO confirm). dst_w_basereg 28 matches the d28-d31 store in
 * the tail_head macro.
 * NOTE(review): the init/cleanup argument lines are not visible here.
 */
1867 generate_composite_function \
1868 pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
1869 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
1870 8, /* number of pixels, processed in a single block */ \
1871 10, /* prefetch distance */ \
1874 pixman_composite_src_pixbuf_8888_process_pixblock_head, \
1875 pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
1876 pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
1877 28, /* dst_w_basereg */ \
1878 0, /* dst_r_basereg */ \
1879 0, /* src_basereg */ \
1880 0 /* mask_basereg */
1882 /******************************************************************************/
/*
 * First half of OVER with a 16-bit (5-6-5) source, an 8-bit mask and a
 * 16-bit destination. Source and destination are first expanded to 8 bits
 * per channel, then the source is scaled by the mask.
 * NOTE(review): some lines of this macro are not visible in this chunk,
 * including whatever sets d7 and the products left in q8 and q9 for the
 * tail macro.
 */
1884 .macro pixman_composite_over_0565_8_0565_process_pixblock_head
1885 /* mask is in d15 */
/* convert_0565_to_x888 is a helper macro defined outside this chunk */
1886 convert_0565_to_x888 q4, d2, d1, d0
1887 convert_0565_to_x888 q5, d6, d5, d4
1888 /* source pixel data is in {d0, d1, d2, XX} */
1889 /* destination pixel data is in {d4, d5, d6, XX} */
/* 16-bit products of mask (d15) and each source channel */
1891 vmull.u8 q6, d15, d2
1892 vmull.u8 q5, d15, d1
1893 vmull.u8 q4, d15, d0
/* 16-bit product of d7 and destination channel d6 (d7 is presumably the
 * inverted mask computed on a line not shown - TODO confirm) */
1896 vmull.u8 q13, d7, d6
/* rounding divide by 255, step 1: t = (x + 128) >> 8 */
1897 vrshr.u16 q12, q6, #8
1898 vrshr.u16 q11, q5, #8
1899 vrshr.u16 q10, q4, #8
/* step 2: (x + t + 128) >> 8, narrowed -> masked source in d0-d2 */
1900 vraddhn.u16 d2, q6, q12
1901 vraddhn.u16 d1, q5, q11
1902 vraddhn.u16 d0, q4, q10
/*
 * Second half of over_0565_8_0565: rounding division by 255 of the
 * destination products (q8, q9, q13), saturating add with the masked
 * source, then conversion back to 5-6-5.
 */
1905 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail
/* t = (x + 128) >> 8 */
1906 vrshr.u16 q14, q8, #8
1907 vrshr.u16 q15, q9, #8
1908 vrshr.u16 q12, q13, #8
/* (x + t + 128) >> 8, narrowed to 8 bits -> d28, d29, d30 */
1909 vraddhn.u16 d28, q14, q8
1910 vraddhn.u16 d29, q15, q9
1911 vraddhn.u16 d30, q12, q13
/* saturating add with the masked source in d0-d3 */
1912 vqadd.u8 q0, q0, q14
1913 vqadd.u8 q1, q1, q15
1914 /* 32bpp result is in {d0, d1, d2, XX} */
/* helper macro defined outside this chunk; presumably leaves the packed
 * result in q14, since tail_head stores d28/d29 - TODO confirm */
1915 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
1918 /* TODO: expand macros and do better instructions scheduling */
/*
 * Combined step: finish the current block, load the next destination
 * block, start processing it and store the finished one.
 * NOTE(review): some lines of this macro are not visible in this chunk.
 */
1919 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
1921 pixman_composite_over_0565_8_0565_process_pixblock_tail
/* next 8 destination pixels (16-bit each) -> d10, d11 (= q5) */
1923 vld1.16 {d10, d11}, [DST_R, :128]!
1925 pixman_composite_over_0565_8_0565_process_pixblock_head
/* store the 8 packed pixels produced by the tail above */
1926 vst1.16 {d28, d29}, [DST_W, :128]!
/*
 * Instantiate over_0565_8_0565. The three numbers after the name are
 * presumably src/mask/dst bits per pixel (generator macro defined
 * elsewhere - TODO confirm). The register bases match the pixblock macros:
 * result stored from d28, destination loaded into d10/d11 (q5), source
 * converted from q4 (d8), mask read from d15.
 */
1929 generate_composite_function \
1930 pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
1931 FLAG_DST_READWRITE, \
1932 8, /* number of pixels, processed in a single block */ \
1933 5, /* prefetch distance */ \
1934 default_init_need_all_regs, \
1935 default_cleanup_need_all_regs, \
1936 pixman_composite_over_0565_8_0565_process_pixblock_head, \
1937 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
1938 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
1939 28, /* dst_w_basereg */ \
1940 10, /* dst_r_basereg */ \
1941 8, /* src_basereg */ \
1942 15 /* mask_basereg */
1944 /******************************************************************************/
/*
 * ADD with a 16-bit (5-6-5) source, an 8-bit mask and a 16-bit
 * destination. The head expands source and destination to 8 bits per
 * channel and scales the source by the mask.
 * NOTE(review): some lines of these macros are not visible in this chunk,
 * including the addition that precedes the final conversion in the tail.
 */
1946 .macro pixman_composite_add_0565_8_0565_process_pixblock_head
1947 /* mask is in d15 */
/* convert_0565_to_x888 is a helper macro defined outside this chunk */
1948 convert_0565_to_x888 q4, d2, d1, d0
1949 convert_0565_to_x888 q5, d6, d5, d4
1950 /* source pixel data is in {d0, d1, d2, XX} */
1951 /* destination pixel data is in {d4, d5, d6, XX} */
/* 16-bit products of mask (d15) and each source channel */
1952 vmull.u8 q6, d15, d2
1953 vmull.u8 q5, d15, d1
1954 vmull.u8 q4, d15, d0
/* rounding divide by 255, step 1: t = (x + 128) >> 8 */
1955 vrshr.u16 q12, q6, #8
1956 vrshr.u16 q11, q5, #8
1957 vrshr.u16 q10, q4, #8
/* step 2: (x + t + 128) >> 8, narrowed -> masked source in d0-d2 */
1958 vraddhn.u16 d2, q6, q12
1959 vraddhn.u16 d1, q5, q11
1960 vraddhn.u16 d0, q4, q10
1963 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail
1966 /* 32bpp result is in {d0, d1, d2, XX} */
/* helper macro defined outside this chunk; presumably leaves the packed
 * result in q14, since tail_head stores d28/d29 - TODO confirm */
1967 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
1970 /* TODO: expand macros and do better instructions scheduling */
1971 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
1973 pixman_composite_add_0565_8_0565_process_pixblock_tail
/* next 8 destination pixels (16-bit each) -> d10, d11 (= q5) */
1975 vld1.16 {d10, d11}, [DST_R, :128]!
1977 pixman_composite_add_0565_8_0565_process_pixblock_head
/* store the 8 packed pixels produced by the tail above */
1978 vst1.16 {d28, d29}, [DST_W, :128]!
/*
 * Instantiate add_0565_8_0565. The three numbers after the name are
 * presumably src/mask/dst bits per pixel (generator macro defined
 * elsewhere - TODO confirm). The register bases match the pixblock macros:
 * result stored from d28, destination loaded into d10/d11 (q5), source
 * converted from q4 (d8), mask read from d15.
 */
1981 generate_composite_function \
1982 pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
1983 FLAG_DST_READWRITE, \
1984 8, /* number of pixels, processed in a single block */ \
1985 5, /* prefetch distance */ \
1986 default_init_need_all_regs, \
1987 default_cleanup_need_all_regs, \
1988 pixman_composite_add_0565_8_0565_process_pixblock_head, \
1989 pixman_composite_add_0565_8_0565_process_pixblock_tail, \
1990 pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
1991 28, /* dst_w_basereg */ \
1992 10, /* dst_r_basereg */ \
1993 8, /* src_basereg */ \
1994 15 /* mask_basereg */
1996 /******************************************************************************/
/*
 * OUT_REVERSE with an 8-bit source on a 16-bit (5-6-5) destination:
 * each destination channel is multiplied by the inverted 8-bit value in
 * d15 and divided by 255 with rounding, then repacked to 5-6-5.
 * The existing comment calls d15 the mask; the instantiation passes 15 as
 * src_basereg, so d15 is where the 8-bit source is loaded.
 */
1998 .macro pixman_composite_out_reverse_8_0565_process_pixblock_head
1999 /* mask is in d15 */
/* convert_0565_to_x888 is a helper macro defined outside this chunk */
2000 convert_0565_to_x888 q5, d6, d5, d4
2001 /* destination pixel data is in {d4, d5, d6, xx} */
2002 vmvn.8 d24, d15 /* get inverted alpha */
2003 /* now do alpha blending */
/* 16-bit products of inverted alpha and each destination channel */
2004 vmull.u8 q8, d24, d4
2005 vmull.u8 q9, d24, d5
2006 vmull.u8 q10, d24, d6
2009 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
/* rounding divide by 255, step 1: t = (x + 128) >> 8 */
2010 vrshr.u16 q14, q8, #8
2011 vrshr.u16 q15, q9, #8
2012 vrshr.u16 q12, q10, #8
/* step 2: (x + t + 128) >> 8, narrowed to 8 bits -> d0, d1, d2 */
2013 vraddhn.u16 d0, q14, q8
2014 vraddhn.u16 d1, q15, q9
2015 vraddhn.u16 d2, q12, q10
2016 /* 32bpp result is in {d0, d1, d2, XX} */
/* helper macro defined outside this chunk; presumably leaves the packed
 * result in q14, since tail_head stores d28/d29 - TODO confirm */
2017 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2020 /* TODO: expand macros and do better instructions scheduling */
/* NOTE(review): some lines of this macro are not visible in this chunk */
2021 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
2023 pixman_composite_out_reverse_8_0565_process_pixblock_tail
/* next 8 destination pixels (16-bit each) -> d10, d11 (= q5) */
2024 vld1.16 {d10, d11}, [DST_R, :128]!
2026 pixman_composite_out_reverse_8_0565_process_pixblock_head
/* store the 8 packed pixels produced by the tail above */
2027 vst1.16 {d28, d29}, [DST_W, :128]!
/*
 * Instantiate out_reverse_8_0565. The three numbers after the name are
 * presumably src/mask/dst bits per pixel (generator macro defined
 * elsewhere - TODO confirm). The register bases match the pixblock macros:
 * result stored from d28, destination loaded into d10/d11 (q5), and the
 * 8-bit source placed in d15.
 */
2030 generate_composite_function \
2031 pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
2032 FLAG_DST_READWRITE, \
2033 8, /* number of pixels, processed in a single block */ \
2034 5, /* prefetch distance */ \
2035 default_init_need_all_regs, \
2036 default_cleanup_need_all_regs, \
2037 pixman_composite_out_reverse_8_0565_process_pixblock_head, \
2038 pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
2039 pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
2040 28, /* dst_w_basereg */ \
2041 10, /* dst_r_basereg */ \
2042 15, /* src_basereg */ \
2043 0 /* mask_basereg */
2045 /******************************************************************************/
/*
 * Instantiate pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon.
 * The pixel arithmetic is not redefined here: the head/tail/tail_head macros
 * of pixman_composite_over_8888_8888 are reused as they are.
 * "32, 0, 32" is presumably source/mask/destination bits per pixel - TODO
 * confirm against generate_composite_function_nearest_scanline.
 * No base register arguments are passed, so presumably the defaults of the
 * generator macro apply.
 */
2047 generate_composite_function_nearest_scanline \
2048 pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
2049 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2050 8, /* number of pixels, processed in a single block */ \
2053 pixman_composite_over_8888_8888_process_pixblock_head, \
2054 pixman_composite_over_8888_8888_process_pixblock_tail, \
2055 pixman_composite_over_8888_8888_process_pixblock_tail_head
/*
 * Instantiate pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon,
 * reusing the head/tail/tail_head macros of pixman_composite_over_8888_0565.
 * "32, 0, 16" is presumably source/mask/destination bits per pixel - TODO
 * confirm against generate_composite_function_nearest_scanline.
 * Register bases are given explicitly: result written from d28, destination
 * read into d4, source in d0 and up, mask base d24.
 */
2057 generate_composite_function_nearest_scanline \
2058 pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
2059 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2060 8, /* number of pixels, processed in a single block */ \
2063 pixman_composite_over_8888_0565_process_pixblock_head, \
2064 pixman_composite_over_8888_0565_process_pixblock_tail, \
2065 pixman_composite_over_8888_0565_process_pixblock_tail_head, \
2066 28, /* dst_w_basereg */ \
2067 4, /* dst_r_basereg */ \
2068 0, /* src_basereg */ \
2069 24 /* mask_basereg */
/*
 * Instantiate pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon,
 * reusing the head/tail/tail_head macros of pixman_composite_src_8888_0565.
 * The destination is flagged write-only (FLAG_DST_WRITEONLY), unlike the
 * OVER variants, which use FLAG_DST_READWRITE.
 * "32, 0, 16" is presumably source/mask/destination bits per pixel - TODO
 * confirm against generate_composite_function_nearest_scanline.
 */
2071 generate_composite_function_nearest_scanline \
2072 pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
2073 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2074 8, /* number of pixels, processed in a single block */ \
2077 pixman_composite_src_8888_0565_process_pixblock_head, \
2078 pixman_composite_src_8888_0565_process_pixblock_tail, \
2079 pixman_composite_src_8888_0565_process_pixblock_tail_head
/*
 * Instantiate pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon,
 * reusing the head/tail/tail_head macros of pixman_composite_src_0565_8888.
 * The destination is flagged write-only (FLAG_DST_WRITEONLY).
 * "16, 0, 32" is presumably source/mask/destination bits per pixel - TODO
 * confirm against generate_composite_function_nearest_scanline.
 */
2081 generate_composite_function_nearest_scanline \
2082 pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
2083 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2084 8, /* number of pixels, processed in a single block */ \
2087 pixman_composite_src_0565_8888_process_pixblock_head, \
2088 pixman_composite_src_0565_8888_process_pixblock_tail, \
2089 pixman_composite_src_0565_8888_process_pixblock_tail_head