2 * Copyright © 2009 Nokia Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
23 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
27 * This file contains implementations of NEON optimized pixel processing
28 * functions. There is no full and detailed tutorial, but some functions
29 * (those which are exposing some new or interesting features) are
30 * extensively commented and can be used as examples.
32 * You may want to have a look at the comments for following functions:
33 * - pixman_composite_over_8888_0565_asm_neon
34 * - pixman_composite_over_n_8_0565_asm_neon
37 /* Prevent the stack from becoming executable for no reason... */
38 #if defined(__linux__) && defined(__ELF__)
39 .section .note.GNU-stack,"",%progbits
/* NOTE(review): matching #endif for the guard above is not visible in this
   view — presumably it follows immediately; verify in the full file. */
/* Clear EABI FP/SIMD build-attribute tags so this object does not force a
   particular FP/SIMD arch on the link (see the inline comments). */
46 .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
47 .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
/* Shared template machinery: generate_composite_function, PF prefetch
   helpers, cache_preload, etc. come from this header. */
51 #include "pixman-arm-neon-asm.h"
53 /* Global configuration options and preferences */
56 * The code can optionally make use of unaligned memory accesses to improve
57 * performance of handling leading/trailing pixels for each scanline.
58 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
59 * example in linux if unaligned memory accesses are not configured to
60 * generate exceptions.
/* 1 = never emit unaligned accesses for leading/trailing pixels (safe default). */
62 .set RESPECT_STRICT_ALIGNMENT, 1
65 * Set default prefetch type. There is a choice between the following options:
67 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
68 * as NOP to workaround some HW bugs or for whatever other reason)
70 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
71 * advanced prefetch introduces heavy overhead)
73 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
74 * which can run ARM and NEON instructions simultaneously so that extra ARM
75 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
77 * Note: some types of function can't support advanced prefetch and fall back
78 * to simple one (those which handle 24bpp pixels)
80 .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
82 /* Prefetch distance in pixels for simple prefetch */
83 .set PREFETCH_DISTANCE_SIMPLE, 64
86 * Implementation of pixman_composite_over_8888_0565_asm_neon
88 * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
89 * performs OVER compositing operation. Function fast_composite_over_8888_0565
90 * from pixman-fast-path.c does the same in C and can be used as a reference.
92 * First we need to have some NEON assembly code which can do the actual
93 * operation on the pixels and provide it to the template macro.
95 * Template macro quite conveniently takes care of emitting all the necessary
96 * code for memory reading and writing (including quite tricky cases of
97 * handling unaligned leading/trailing pixels), so we only need to deal with
98 * the data in NEON registers.
100 * NEON register allocation in general is recommended to be the following:
101 * d0, d1, d2, d3 - contain loaded source pixel data
102 * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed)
103 * d24, d25, d26, d27 - contain loaded mask pixel data (if mask is used)
104 * d28, d29, d30, d31 - place for storing the result (destination pixels)
106 * As can be seen above, four 64-bit NEON registers are used for keeping
107 * intermediate pixel data and up to 8 pixels can be processed in one step
108 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
110 * This particular function uses the following registers allocation:
111 * d0, d1, d2, d3 - contain loaded source pixel data
112 * d4, d5 - contain loaded destination pixels (they are needed)
113 * d28, d29 - place for storing the result (destination pixels)
117 * Step one. We need to have some code to do some arithmetics on pixel data.
118 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
119 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
120 * perform all the needed calculations and write the result to {d28, d29}.
121 * The rationale for having two macros and not just one will be explained
122 * later. In practice, any single monolithic function which does the work can
123 * be split into two parts in any arbitrary way without affecting correctness.
125 * There is one special trick here too. Common template macro can optionally
126 * make our life a bit easier by doing R, G, B, A color components
127 * deinterleaving for 32bpp pixel formats (and this feature is used in
128 * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
129 * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
130 * actually use d0 register for blue channel (a vector of eight 8-bit
131 * values), d1 register for green, d2 for red and d3 for alpha. This
132 * simple conversion can be also done with a few NEON instructions:
134 * Packed to planar conversion:
140 * Planar to packed conversion:
146 * But pixel can be loaded directly in planar format using VLD4.8 NEON
147 * instruction. It is 1 cycle slower than VLD1.32, so this is not always
148 * desirable, that's why deinterleaving is optional.
150 * But anyway, here is the code:
/* 'head' stage of the OVER a8r8g8b8 -> r5g6b5 pixblock: src in d0-d3
   (planar b/g/r/a), dst in {d4, d5} (packed 565). */
152 .macro pixman_composite_over_8888_0565_process_pixblock_head
153 /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
154 and put data into d6 - red, d7 - green, d30 - blue */
159 vmvn.8 d3, d3 /* invert source alpha */
161 vshrn.u16 d30, q2, #2
162 /* now do alpha blending, storing results in 8-bit planar format
163 into d16 - red, d19 - green, d18 - blue */
/* q12 = inv_src_alpha * dst_blue (widening 8x8 -> 16 multiply) */
166 vmull.u8 q12, d3, d30
/* The vrshr #8 + vraddhn pairs below implement the standard rounding
   approximation of x/255 for each 16-bit product. */
167 vrshr.u16 q13, q10, #8
168 vrshr.u16 q3, q11, #8
169 vrshr.u16 q15, q12, #8
170 vraddhn.u16 d20, q10, q13
171 vraddhn.u16 d23, q11, q3
172 vraddhn.u16 d22, q12, q15
/* 'tail' stage: add source on top of the attenuated destination and
   repack to r5g6b5 in {d28, d29}. */
175 .macro pixman_composite_over_8888_0565_process_pixblock_tail
176 /* ... continue alpha blending */
/* saturating add: result_red = src_red + dst_red * (1 - src_alpha) */
177 vqadd.u8 d16, d2, d20
179 /* convert the result to r5g6b5 and store it into {d28, d29} */
/* widen-shift red into place, then shift-right-insert the other fields */
180 vshll.u8 q14, d16, #8
184 vsri.u16 q14, q9, #11
188 * OK, now we got almost everything that we need. Using the above two
189 * macros, the work can be done right. But now we want to optimize
190 * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
191 * a lot from good code scheduling and software pipelining.
193 * Let's construct some code, which will run in the core main loop.
194 * Some pseudo-code of the main loop will look like this:
202 * It may look a bit weird, but this setup allows to hide instruction
203 * latencies better and also utilize dual-issue capability more
204 * efficiently (make pairs of load-store and ALU instructions).
206 * So what we need now is a '*_tail_head' macro, which will be used
207 * in the core main loop. A trivial straightforward implementation
208 * of this macro would look like this:
210 * pixman_composite_over_8888_0565_process_pixblock_tail
211 * vst1.16 {d28, d29}, [DST_W, :128]!
212 * vld1.16 {d4, d5}, [DST_R, :128]!
213 * vld4.32 {d0, d1, d2, d3}, [SRC]!
214 * pixman_composite_over_8888_0565_process_pixblock_head
217 * Now it also got some VLD/VST instructions. We simply can't move from
218 * processing one block of pixels to the other one with just arithmetics.
219 * The previously processed data needs to be written to memory and new
220 * data needs to be fetched. Fortunately, this main loop does not deal
221 * with partial leading/trailing pixels and can load/store a full block
222 * of pixels in a bulk. Additionally, destination buffer is already
223 * 16 bytes aligned here (which is good for performance).
225 * New things here are DST_R, DST_W, SRC and MASK identifiers. These
226 * are the aliases for ARM registers which are used as pointers for
227 * accessing data. We maintain separate pointers for reading and writing
228 * destination buffer (DST_R and DST_W).
230 * Another new thing is 'cache_preload' macro. It is used for prefetching
231 * data into CPU L2 cache and improving performance when dealing with large
232 * images which are far larger than cache size. It uses one argument
233 * (actually two, but they need to be the same here) - number of pixels
234 * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
235 * details about this macro. Moreover, if good performance is needed
236 * the code from this macro needs to be copied into '*_tail_head' macro
237 * and mixed with the rest of code for optimal instructions scheduling.
238 * We are actually doing it below.
240 * Now after all the explanations, here is the optimized code.
241 * Different instruction streams (originating from '*_head', '*_tail'
242 * and 'cache_preload' macro) use different indentation levels for
243 * better readability. Actually taking the code from one of these
244 * indentation levels and ignoring a few VLD/VST instructions would
245 * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
/* Software-pipelined main-loop body: finishes the previous pixel block
   ('tail' stream), starts the next one ('head' stream), and interleaves
   the block load/store plus the PF prefetch stream so that ARM and NEON
   instructions can dual-issue on Cortex-A8.
   PF lines: advance the prefetch position PF_X, issue pld for src/dst,
   and step to the next scanline once PF_X passes ORIG_W. */
251 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head
252 vqadd.u8 d16, d2, d20
253 vld1.16 {d4, d5}, [DST_R, :128]!
259 vshll.u8 q14, d16, #8
260 PF add PF_X, PF_X, #8
264 PF addne PF_X, PF_X, #8
266 PF subne PF_CTL, PF_CTL, #1
268 vshrn.u16 d30, q2, #2
270 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
272 vmull.u8 q12, d3, d30
273 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
277 vrshr.u16 q13, q10, #8
278 PF subge PF_X, PF_X, ORIG_W
279 vrshr.u16 q3, q11, #8
280 vrshr.u16 q15, q12, #8
281 PF subges PF_CTL, PF_CTL, #0x10
282 vsri.u16 q14, q9, #11
283 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
284 vraddhn.u16 d20, q10, q13
285 vraddhn.u16 d23, q11, q3
286 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
287 vraddhn.u16 d22, q12, q15
288 vst1.16 {d28, d29}, [DST_W, :128]!
293 /* If we did not care much about the performance, we would just use this... */
/* Trivial reference version of the macro above: tail, store, load, head
   in sequence, with no instruction interleaving.
   NOTE(review): this duplicates the macro name defined above — presumably
   it is disabled by a preprocessor guard in the full file; verify. */
294 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head
295 pixman_composite_over_8888_0565_process_pixblock_tail
296 vst1.16 {d28, d29}, [DST_W, :128]!
297 vld1.16 {d4, d5}, [DST_R, :128]!
299 pixman_composite_over_8888_0565_process_pixblock_head
306 * And now the final part. We are using 'generate_composite_function' macro
307 * to put all the stuff together. We are specifying the name of the function
308 * which we want to get, number of bits per pixel for the source, mask and
309 * destination (0 if unused, like mask in this case). Next come some bit
311 * FLAG_DST_READWRITE - tells that the destination buffer is both read
312 * and written, for write-only buffer we would use
313 * FLAG_DST_WRITEONLY flag instead
314 * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
315 * and separate color channels for 32bpp format.
316 * The next things are:
317 * - the number of pixels processed per iteration (8 in this case, because
318 * that's the maximum what can fit into four 64-bit NEON registers).
319 * - prefetch distance, measured in pixel blocks. In this case it is 5 times
320 * by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
321 * prefetch distance can be selected by running some benchmarks.
323 * After that we specify some macros, these are 'default_init',
324 * 'default_cleanup' here which are empty (but it is possible to have custom
325 * init/cleanup macros to be able to save/restore some extra NEON registers
326 * like d8-d15 or do anything else) followed by
327 * 'pixman_composite_over_8888_0565_process_pixblock_head',
328 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
329 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
330 * which we got implemented above.
332 * The last part is the NEON registers allocation scheme.
/* Emit the full OVER a8r8g8b8 -> r5g6b5 function from the template:
   args are name, src_bpp (32), mask_bpp (0 = no mask), dst_bpp (16),
   flags, pixels per block, prefetch distance, then the pixblock macros
   and the base NEON register numbers for dst-write/dst-read/src/mask. */
334 generate_composite_function \
335 pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
336 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
337 8, /* number of pixels, processed in a single block */ \
338 5, /* prefetch distance */ \
341 pixman_composite_over_8888_0565_process_pixblock_head, \
342 pixman_composite_over_8888_0565_process_pixblock_tail, \
343 pixman_composite_over_8888_0565_process_pixblock_tail_head, \
344 28, /* dst_w_basereg */ \
345 4, /* dst_r_basereg */ \
346 0, /* src_basereg */ \
347 24 /* mask_basereg */
349 /******************************************************************************/
/* OVER solid-color -> r5g6b5. Same arithmetic as the 8888_0565 variant,
   but the source is a constant loaded once by the init macro below, so
   the per-block code does not reload or re-invert the source alpha. */
351 .macro pixman_composite_over_n_0565_process_pixblock_head
352 /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
353 and put data into d6 - red, d7 - green, d30 - blue */
359 vshrn.u16 d30, q2, #2
360 /* now do alpha blending, storing results in 8-bit planar format
361 into d16 - red, d19 - green, d18 - blue */
/* q12 = inv_src_alpha * dst_blue; vrshr+vraddhn pairs = rounding x/255 */
364 vmull.u8 q12, d3, d30
365 vrshr.u16 q13, q10, #8
366 vrshr.u16 q3, q11, #8
367 vrshr.u16 q15, q12, #8
368 vraddhn.u16 d20, q10, q13
369 vraddhn.u16 d23, q11, q3
370 vraddhn.u16 d22, q12, q15
373 .macro pixman_composite_over_n_0565_process_pixblock_tail
374 /* ... continue alpha blending */
375 vqadd.u8 d16, d2, d20
377 /* convert the result to r5g6b5 and store it into {d28, d29} */
378 vshll.u8 q14, d16, #8
382 vsri.u16 q14, q9, #11
385 /* TODO: expand macros and do better instructions scheduling */
/* Unscheduled main-loop body: plain tail / load / store / head sequence. */
386 .macro pixman_composite_over_n_0565_process_pixblock_tail_head
387 pixman_composite_over_n_0565_process_pixblock_tail
388 vld1.16 {d4, d5}, [DST_R, :128]!
389 vst1.16 {d28, d29}, [DST_W, :128]!
390 pixman_composite_over_n_0565_process_pixblock_head
/* init: fetch the solid source color (first stack argument) into d3 and
   pre-invert its alpha once, outside the pixel loop. */
394 .macro pixman_composite_over_n_0565_init
395 add DUMMY, sp, #ARGS_STACK_OFFSET
396 vld1.32 {d3[0]}, [DUMMY]
401 vmvn.8 d3, d3 /* invert source alpha */
/* OVER solid -> r5g6b5: src_bpp 0 marks a solid (constant) source. */
404 generate_composite_function \
405 pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
406 FLAG_DST_READWRITE, \
407 8, /* number of pixels, processed in a single block */ \
408 5, /* prefetch distance */ \
409 pixman_composite_over_n_0565_init, \
411 pixman_composite_over_n_0565_process_pixblock_head, \
412 pixman_composite_over_n_0565_process_pixblock_tail, \
413 pixman_composite_over_n_0565_process_pixblock_tail_head, \
414 28, /* dst_w_basereg */ \
415 4, /* dst_r_basereg */ \
416 0, /* src_basereg */ \
417 24 /* mask_basereg */
419 /******************************************************************************/
/* SRC a8r8g8b8 -> r5g6b5: plain format conversion, no blending (the
   destination is write-only — see the generator flags below). */
421 .macro pixman_composite_src_8888_0565_process_pixblock_head
427 .macro pixman_composite_src_8888_0565_process_pixblock_tail
/* finish packing to 565 by inserting the remaining field(s) */
429 vsri.u16 q14, q9, #11
/* Pipelined main-loop body with the PF prefetch stream interleaved
   (only the source stream is prefetched; dst is write-only). */
432 .macro pixman_composite_src_8888_0565_process_pixblock_tail_head
434 PF add PF_X, PF_X, #8
437 PF addne PF_X, PF_X, #8
438 PF subne PF_CTL, PF_CTL, #1
439 vsri.u16 q14, q9, #11
441 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
443 vst1.16 {d28, d29}, [DST_W, :128]!
444 PF subge PF_X, PF_X, ORIG_W
445 PF subges PF_CTL, PF_CTL, #0x10
447 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
/* SRC a8r8g8b8 -> r5g6b5: write-only dst, planar 32bpp source loads. */
451 generate_composite_function \
452 pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
453 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
454 8, /* number of pixels, processed in a single block */ \
455 10, /* prefetch distance */ \
458 pixman_composite_src_8888_0565_process_pixblock_head, \
459 pixman_composite_src_8888_0565_process_pixblock_tail, \
460 pixman_composite_src_8888_0565_process_pixblock_tail_head
462 /******************************************************************************/
/* SRC r5g6b5 -> a8r8g8b8: unpack the 5/6/5-bit fields of the packed
   source (in q0) into separate byte planes d28-d30 by narrowing shifts;
   d31 (alpha) is presumably set to opaque elsewhere — not visible here. */
464 .macro pixman_composite_src_0565_8888_process_pixblock_head
465 vshrn.u16 d30, q0, #8
466 vshrn.u16 d29, q0, #3
471 vshrn.u16 d28, q0, #2
474 .macro pixman_composite_src_0565_8888_process_pixblock_tail
477 /* TODO: expand macros and do better instructions scheduling */
/* Unscheduled main-loop body: tail, planar 32bpp store, head. */
478 .macro pixman_composite_src_0565_8888_process_pixblock_tail_head
479 pixman_composite_src_0565_8888_process_pixblock_tail
480 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
482 pixman_composite_src_0565_8888_process_pixblock_head
/* SRC r5g6b5 -> a8r8g8b8: write-only dst, deinterleaved 32bpp stores. */
486 generate_composite_function \
487 pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
488 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
489 8, /* number of pixels, processed in a single block */ \
490 10, /* prefetch distance */ \
493 pixman_composite_src_0565_8888_process_pixblock_head, \
494 pixman_composite_src_0565_8888_process_pixblock_tail, \
495 pixman_composite_src_0565_8888_process_pixblock_tail_head
497 /******************************************************************************/
/* ADD 8bpp + 8bpp: saturating byte-wise add of src and dst, 32 pixels
   per block (four 64-bit registers of 8-bit pixels). */
499 .macro pixman_composite_add_8_8_process_pixblock_head
504 .macro pixman_composite_add_8_8_process_pixblock_tail
/* Pipelined main-loop body: bulk load of dst (d4-d7), bulk store of the
   previous result (d28-d31), interleaved with the PF prefetch stream
   (PF_X advances by 32 = pixels per block). */
507 .macro pixman_composite_add_8_8_process_pixblock_tail_head
509 PF add PF_X, PF_X, #32
511 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
512 PF addne PF_X, PF_X, #32
513 PF subne PF_CTL, PF_CTL, #1
514 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
516 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
517 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
518 PF subge PF_X, PF_X, ORIG_W
519 PF subges PF_CTL, PF_CTL, #0x10
521 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
522 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
/* ADD a8 + a8 -> a8: 32 pixels per block at 8bpp. */
526 generate_composite_function \
527 pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
528 FLAG_DST_READWRITE, \
529 32, /* number of pixels, processed in a single block */ \
530 10, /* prefetch distance */ \
533 pixman_composite_add_8_8_process_pixblock_head, \
534 pixman_composite_add_8_8_process_pixblock_tail, \
535 pixman_composite_add_8_8_process_pixblock_tail_head
537 /******************************************************************************/
/* 32bpp variant of the ADD main-loop body: same structure as the 8bpp
   one, but 8 pixels per block (PF_X advances by 8) and 32-bit element
   loads/stores. The head/tail arithmetic is shared with add_8_8 (ADD is
   a byte-wise operation, independent of the pixel format). */
539 .macro pixman_composite_add_8888_8888_process_pixblock_tail_head
541 PF add PF_X, PF_X, #8
543 vld1.32 {d4, d5, d6, d7}, [DST_R, :128]!
544 PF addne PF_X, PF_X, #8
545 PF subne PF_CTL, PF_CTL, #1
546 vst1.32 {d28, d29, d30, d31}, [DST_W, :128]!
548 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
549 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
550 PF subge PF_X, PF_X, ORIG_W
551 PF subges PF_CTL, PF_CTL, #0x10
553 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
554 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
/* ADD a8r8g8b8 + a8r8g8b8: reuses the 8bpp ADD head/tail macros. */
558 generate_composite_function \
559 pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
560 FLAG_DST_READWRITE, \
561 8, /* number of pixels, processed in a single block */ \
562 10, /* prefetch distance */ \
565 pixman_composite_add_8_8_process_pixblock_head, \
566 pixman_composite_add_8_8_process_pixblock_tail, \
567 pixman_composite_add_8888_8888_process_pixblock_tail_head
/* Single-scanline variant (no prefetch distance argument). */
569 generate_composite_function_single_scanline \
570 pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
571 FLAG_DST_READWRITE, \
572 8, /* number of pixels, processed in a single block */ \
575 pixman_composite_add_8_8_process_pixblock_head, \
576 pixman_composite_add_8_8_process_pixblock_tail, \
577 pixman_composite_add_8888_8888_process_pixblock_tail_head
579 /******************************************************************************/
/* OUT_REVERSE 8888 x 8888: dst = dst * (1 - src_alpha).
   head: multiply each planar dst channel (d4-d7) by inverted src alpha. */
581 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
582 vmvn.8 d24, d3 /* get inverted alpha */
583 /* do alpha blending */
586 vmull.u8 q10, d24, d6
587 vmull.u8 q11, d24, d7
/* tail: vrshr #8 + vraddhn pairs = rounding x/255 on each 16-bit
   product, narrowing the results into d28-d31. */
590 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
591 vrshr.u16 q14, q8, #8
592 vrshr.u16 q15, q9, #8
593 vrshr.u16 q12, q10, #8
594 vrshr.u16 q13, q11, #8
595 vraddhn.u16 d28, q14, q8
596 vraddhn.u16 d29, q15, q9
597 vraddhn.u16 d30, q12, q10
598 vraddhn.u16 d31, q13, q11
/* Pipelined main-loop body: tail of previous block interleaved with the
   dst load, PF prefetch stream, store, and start of the next head.
   NOTE(review): the final vmull pair uses d22 — presumably the inverted
   alpha is kept in d22 in the pipelined version (the vmvn producing it
   is not visible here); verify against the full file. */
601 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
602 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
603 vrshr.u16 q14, q8, #8
604 PF add PF_X, PF_X, #8
606 vrshr.u16 q15, q9, #8
607 vrshr.u16 q12, q10, #8
608 vrshr.u16 q13, q11, #8
609 PF addne PF_X, PF_X, #8
610 PF subne PF_CTL, PF_CTL, #1
611 vraddhn.u16 d28, q14, q8
612 vraddhn.u16 d29, q15, q9
614 vraddhn.u16 d30, q12, q10
615 vraddhn.u16 d31, q13, q11
617 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
619 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
620 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
621 PF subge PF_X, PF_X, ORIG_W
623 PF subges PF_CTL, PF_CTL, #0x10
625 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
626 vmull.u8 q10, d22, d6
627 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
628 vmull.u8 q11, d22, d7
/* Single-scanline OUT_REVERSE entry point built from the macros above. */
631 generate_composite_function_single_scanline \
632 pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
633 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
634 8, /* number of pixels, processed in a single block */ \
637 pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
638 pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
639 pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
641 /******************************************************************************/
/* OVER 8888 x 8888 = OUT_REVERSE (dst * (1 - src_alpha)) followed by a
   saturating add of the source: the head is shared, the tail appends
   the vqadd pair (q0/q1 hold the planar source channels). */
643 .macro pixman_composite_over_8888_8888_process_pixblock_head
644 pixman_composite_out_reverse_8888_8888_process_pixblock_head
647 .macro pixman_composite_over_8888_8888_process_pixblock_tail
648 pixman_composite_out_reverse_8888_8888_process_pixblock_tail
649 vqadd.u8 q14, q0, q14
650 vqadd.u8 q15, q1, q15
/* Pipelined main-loop body, same interleaving pattern as the
   out_reverse version plus the OVER vqadd stage.
   NOTE(review): trailing vmull pair reads the inverted alpha from d22 —
   the producing vmvn is not visible in this view; verify. */
653 .macro pixman_composite_over_8888_8888_process_pixblock_tail_head
654 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
655 vrshr.u16 q14, q8, #8
656 PF add PF_X, PF_X, #8
658 vrshr.u16 q15, q9, #8
659 vrshr.u16 q12, q10, #8
660 vrshr.u16 q13, q11, #8
661 PF addne PF_X, PF_X, #8
662 PF subne PF_CTL, PF_CTL, #1
663 vraddhn.u16 d28, q14, q8
664 vraddhn.u16 d29, q15, q9
666 vraddhn.u16 d30, q12, q10
667 vraddhn.u16 d31, q13, q11
668 vqadd.u8 q14, q0, q14
669 vqadd.u8 q15, q1, q15
671 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
673 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
674 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
675 PF subge PF_X, PF_X, ORIG_W
677 PF subges PF_CTL, PF_CTL, #0x10
679 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
680 vmull.u8 q10, d22, d6
681 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
682 vmull.u8 q11, d22, d7
/* Full OVER a8r8g8b8 -> a8r8g8b8 function. */
685 generate_composite_function \
686 pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
687 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
688 8, /* number of pixels, processed in a single block */ \
689 5, /* prefetch distance */ \
692 pixman_composite_over_8888_8888_process_pixblock_head, \
693 pixman_composite_over_8888_8888_process_pixblock_tail, \
694 pixman_composite_over_8888_8888_process_pixblock_tail_head
/* Single-scanline OVER entry point from the same pixblock macros. */
696 generate_composite_function_single_scanline \
697 pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
698 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
699 8, /* number of pixels, processed in a single block */ \
702 pixman_composite_over_8888_8888_process_pixblock_head, \
703 pixman_composite_over_8888_8888_process_pixblock_tail, \
704 pixman_composite_over_8888_8888_process_pixblock_tail_head
706 /******************************************************************************/
708 /* TODO: expand macros and do better instructions scheduling */
/* OVER solid -> a8r8g8b8: unscheduled main-loop body reusing the
   8888_8888 head/tail; the solid source stays resident in d0-d3. */
709 .macro pixman_composite_over_n_8888_process_pixblock_tail_head
710 pixman_composite_over_8888_8888_process_pixblock_tail
711 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
712 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
713 pixman_composite_over_8888_8888_process_pixblock_head
/* init: fetch the solid source color (first stack argument) into d3;
   the replication into the other source registers is not visible here. */
717 .macro pixman_composite_over_n_8888_init
718 add DUMMY, sp, #ARGS_STACK_OFFSET
719 vld1.32 {d3[0]}, [DUMMY]
/* OVER solid -> a8r8g8b8: src_bpp 0 marks a solid source. */
726 generate_composite_function \
727 pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
728 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
729 8, /* number of pixels, processed in a single block */ \
730 5, /* prefetch distance */ \
731 pixman_composite_over_n_8888_init, \
733 pixman_composite_over_8888_8888_process_pixblock_head, \
734 pixman_composite_over_8888_8888_process_pixblock_tail, \
735 pixman_composite_over_n_8888_process_pixblock_tail_head
737 /******************************************************************************/
/* OVER_REVERSE solid -> a8r8g8b8: the roles of src and dst registers
   are swapped relative to plain OVER — the destination is read into
   d0-d3 (see vld4.8 below and the basereg arguments in the generator),
   while the solid 'source' sits in the d4-d7 range (init loads d7).
   Only the dst stream is prefetched (the source is a constant). */
739 .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
740 vrshr.u16 q14, q8, #8
741 PF add PF_X, PF_X, #8
743 vrshr.u16 q15, q9, #8
744 vrshr.u16 q12, q10, #8
745 vrshr.u16 q13, q11, #8
746 PF addne PF_X, PF_X, #8
747 PF subne PF_CTL, PF_CTL, #1
748 vraddhn.u16 d28, q14, q8
749 vraddhn.u16 d29, q15, q9
751 vraddhn.u16 d30, q12, q10
752 vraddhn.u16 d31, q13, q11
753 vqadd.u8 q14, q0, q14
754 vqadd.u8 q15, q1, q15
755 vld4.8 {d0, d1, d2, d3}, [DST_R, :128]!
757 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
758 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
759 PF subge PF_X, PF_X, ORIG_W
761 PF subges PF_CTL, PF_CTL, #0x10
763 vmull.u8 q10, d22, d6
764 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
765 vmull.u8 q11, d22, d7
/* init: fetch the solid color (first stack argument) into d7. */
768 .macro pixman_composite_over_reverse_n_8888_init
769 add DUMMY, sp, #ARGS_STACK_OFFSET
770 vld1.32 {d7[0]}, [DUMMY]
/* OVER_REVERSE solid -> a8r8g8b8. Note the swapped base registers:
   dst is read into d0+ and the solid source occupies d4+. */
777 generate_composite_function \
778 pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
779 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
780 8, /* number of pixels, processed in a single block */ \
781 5, /* prefetch distance */ \
782 pixman_composite_over_reverse_n_8888_init, \
784 pixman_composite_over_8888_8888_process_pixblock_head, \
785 pixman_composite_over_8888_8888_process_pixblock_tail, \
786 pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
787 28, /* dst_w_basereg */ \
788 0, /* dst_r_basereg */ \
789 4, /* src_basereg */ \
790 24 /* mask_basereg */
792 /******************************************************************************/
/* OVER 8888 x a8 mask -> r5g6b5, 'head' stage. Three phases:
   1) IN: multiply the planar source channels (d8-d11 range) by the
      8-bit mask (d24), with vrshr+vraddhn rounding x/255 correction,
      leaving the masked source in d0-d3;
   2) unpack the 565 destination (q2) into planar 8-bit channels;
   3) start the OVER blend: inverted src alpha (d3) times dst channels. */
794 .macro pixman_composite_over_8888_8_0565_process_pixblock_head
795 vmull.u8 q0, d24, d8 /* IN for SRC pixels (part1) */
797 vmull.u8 q6, d24, d10
798 vmull.u8 q7, d24, d11
799 vshrn.u16 d6, q2, #8 /* convert DST_R data to 32-bpp (part1) */
802 vrshr.u16 q8, q0, #8 /* IN for SRC pixels (part2) */
804 vrshr.u16 q10, q6, #8
805 vrshr.u16 q11, q7, #8
806 vraddhn.u16 d0, q0, q8
807 vraddhn.u16 d1, q1, q9
808 vraddhn.u16 d2, q6, q10
809 vraddhn.u16 d3, q7, q11
810 vsri.u8 d6, d6, #5 /* convert DST_R data to 32-bpp (part2) */
813 vshrn.u16 d30, q2, #2
814 vmull.u8 q8, d3, d6 /* now do alpha blending */
816 vmull.u8 q10, d3, d30
/* 'tail' stage: finish the OVER blend (rounding x/255 on the products),
   saturating-add the masked source, and repack the result to r5g6b5
   in q14 ({d28, d29}). */
819 .macro pixman_composite_over_8888_8_0565_process_pixblock_tail
820 /* 3 cycle bubble (after vmull.u8) */
821 vrshr.u16 q13, q8, #8
822 vrshr.u16 q11, q9, #8
823 vrshr.u16 q15, q10, #8
824 vraddhn.u16 d16, q8, q13
825 vraddhn.u16 d27, q9, q11
826 vraddhn.u16 d26, q10, q15
827 vqadd.u8 d16, d2, d16
830 vshll.u8 q14, d16, #8 /* convert to 16bpp */
835 vsri.u16 q14, q9, #11
/* Pipelined main-loop body: the three indentation levels visible in the
   full file correspond to the tail stream, the head stream, and the
   load/store stream interleaved for Cortex-A8 dual-issue. */
838 .macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
839 vld1.16 {d4, d5}, [DST_R, :128]!
844 vmull.u8 q6, d24, d10
845 vrshr.u16 q13, q8, #8
846 vrshr.u16 q11, q9, #8
847 vrshr.u16 q15, q10, #8
848 vraddhn.u16 d16, q8, q13
849 vraddhn.u16 d27, q9, q11
850 vraddhn.u16 d26, q10, q15
851 vqadd.u8 d16, d2, d16
854 vshll.u8 q14, d16, #8
859 vmull.u8 q7, d24, d11
860 vsri.u16 q14, q9, #11
867 vrshr.u16 q10, q6, #8
868 vrshr.u16 q11, q7, #8
869 vraddhn.u16 d0, q0, q8
870 vraddhn.u16 d1, q1, q9
871 vraddhn.u16 d2, q6, q10
872 vraddhn.u16 d3, q7, q11
876 vshrn.u16 d30, q2, #2
877 vst1.16 {d28, d29}, [DST_W, :128]!
880 vmull.u8 q10, d3, d30
/* OVER a8r8g8b8 with a8 mask -> r5g6b5. Uses the need_all_regs
   init/cleanup variants — presumably they save/restore the callee-saved
   d8-d15 range (see pixman-arm-neon-asm.h); the source lives at
   basereg 8 (d8+), so those registers are in use. */
883 generate_composite_function \
884 pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
885 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
886 8, /* number of pixels, processed in a single block */ \
887 5, /* prefetch distance */ \
888 default_init_need_all_regs, \
889 default_cleanup_need_all_regs, \
890 pixman_composite_over_8888_8_0565_process_pixblock_head, \
891 pixman_composite_over_8888_8_0565_process_pixblock_tail, \
892 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
893 28, /* dst_w_basereg */ \
894 4, /* dst_r_basereg */ \
895 8, /* src_basereg */ \
896 24 /* mask_basereg */
898 /******************************************************************************/
901 * This function needs a special initialization of solid mask.
902 * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
903 * offset, split into color components and replicated in d8-d11
904 * registers. Additionally, this function needs all the NEON registers,
905 * so it has to save d8-d15 registers which are callee saved according
906 * to ABI. These registers are restored from 'cleanup' macro. All the
907 * other NEON registers are caller saved, so can be clobbered freely
908 * without introducing any problems.
/* init: fetch the solid source color from the stack into d11; the
   comment block above describes how it is split into components and
   replicated across d8-d11 (those instructions are not visible here).
   cleanup restores the callee-saved d8-d15 range per the same comment. */
910 .macro pixman_composite_over_n_8_0565_init
911 add DUMMY, sp, #ARGS_STACK_OFFSET
913 vld1.32 {d11[0]}, [DUMMY]
920 .macro pixman_composite_over_n_8_0565_cleanup
/* OVER solid with a8 mask -> r5g6b5: reuses the 8888_8_0565 pixblock
   macros; only init/cleanup differ (solid color preloaded into d8-d11). */
924 generate_composite_function \
925 pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
926 FLAG_DST_READWRITE, \
927 8, /* number of pixels, processed in a single block */ \
928 5, /* prefetch distance */ \
929 pixman_composite_over_n_8_0565_init, \
930 pixman_composite_over_n_8_0565_cleanup, \
931 pixman_composite_over_8888_8_0565_process_pixblock_head, \
932 pixman_composite_over_8888_8_0565_process_pixblock_tail, \
933 pixman_composite_over_8888_8_0565_process_pixblock_tail_head
935 /******************************************************************************/
/* init: fetch the solid MASK value (second stack argument, hence the +8
   offset) into d24 — the mask base register used by the shared
   8888_8_0565 pixblock macros. */
937 .macro pixman_composite_over_8888_n_0565_init
938 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
940 vld1.32 {d24[0]}, [DUMMY]
944 .macro pixman_composite_over_8888_n_0565_cleanup
/* OVER a8r8g8b8 with solid mask -> r5g6b5 (mask_bpp 0 = solid mask). */
948 generate_composite_function \
949 pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
950 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
951 8, /* number of pixels, processed in a single block */ \
952 5, /* prefetch distance */ \
953 pixman_composite_over_8888_n_0565_init, \
954 pixman_composite_over_8888_n_0565_cleanup, \
955 pixman_composite_over_8888_8_0565_process_pixblock_head, \
956 pixman_composite_over_8888_8_0565_process_pixblock_tail, \
957 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
958 28, /* dst_w_basereg */ \
959 4, /* dst_r_basereg */ \
960 8, /* src_basereg */ \
961 24 /* mask_basereg */
963 /******************************************************************************/
/* SRC r5g6b5 -> r5g6b5: a plain copy — head/tail do no arithmetic and
   the loop body just stores the loaded pixels from d0-d3. */
965 .macro pixman_composite_src_0565_0565_process_pixblock_head
968 .macro pixman_composite_src_0565_0565_process_pixblock_tail
971 .macro pixman_composite_src_0565_0565_process_pixblock_tail_head
972 vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
/* 16bpp copy: 16 pixels per block; source is loaded directly into the
   d0-based store registers (src_basereg 0 = dst_w_basereg 0). */
977 generate_composite_function \
978 pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
979 FLAG_DST_WRITEONLY, \
980 16, /* number of pixels, processed in a single block */ \
981 10, /* prefetch distance */ \
984 pixman_composite_src_0565_0565_process_pixblock_head, \
985 pixman_composite_src_0565_0565_process_pixblock_tail, \
986 pixman_composite_src_0565_0565_process_pixblock_tail_head, \
987 0, /* dst_w_basereg */ \
988 0, /* dst_r_basereg */ \
989 0, /* src_basereg */ \
992 /******************************************************************************/
/* SRC solid -> a8 (solid fill of an 8bpp buffer): head/tail are empty,
   the loop just stores d0-d3 filled once by init. */
994 .macro pixman_composite_src_n_8_process_pixblock_head
997 .macro pixman_composite_src_n_8_process_pixblock_tail
1000 .macro pixman_composite_src_n_8_process_pixblock_tail_head
1001 vst1.8 {d0, d1, d2, d3}, [DST_W, :128]!
/* init: load the 32-bit solid value into d0[0], then use vsli shifts to
   replicate it across the whole of d0 (d1-d3 duplication not visible). */
1004 .macro pixman_composite_src_n_8_init
1005 add DUMMY, sp, #ARGS_STACK_OFFSET
1006 vld1.32 {d0[0]}, [DUMMY]
1008 vsli.u64 d0, d0, #16
1009 vsli.u64 d0, d0, #32
1014 .macro pixman_composite_src_n_8_cleanup
1017 generate_composite_function \
1018 pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
1019 FLAG_DST_WRITEONLY, \
1020 32, /* number of pixels, processed in a single block */ \
1021 0, /* prefetch distance */ \
1022 pixman_composite_src_n_8_init, \
1023 pixman_composite_src_n_8_cleanup, \
1024 pixman_composite_src_n_8_process_pixblock_head, \
1025 pixman_composite_src_n_8_process_pixblock_tail, \
1026 pixman_composite_src_n_8_process_pixblock_tail_head, \
1027 0, /* dst_w_basereg */ \
1028 0, /* dst_r_basereg */ \
1029 0, /* src_basereg */ \
1030 0 /* mask_basereg */
1032 /******************************************************************************/
/* src_n_0565: fill an r5g6b5 destination with a solid value.  init
 * duplicates the 16-bit pixel across d0 via vsli #16 then #32; the
 * pixblock itself is store-only. */
1034 .macro pixman_composite_src_n_0565_process_pixblock_head
1037 .macro pixman_composite_src_n_0565_process_pixblock_tail
1040 .macro pixman_composite_src_n_0565_process_pixblock_tail_head
1041 vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
1044 .macro pixman_composite_src_n_0565_init
1045 add DUMMY, sp, #ARGS_STACK_OFFSET
1046 vld1.32 {d0[0]}, [DUMMY]
1047 vsli.u64 d0, d0, #16
1048 vsli.u64 d0, d0, #32
1053 .macro pixman_composite_src_n_0565_cleanup
1056 generate_composite_function \
1057 pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
1058 FLAG_DST_WRITEONLY, \
1059 16, /* number of pixels, processed in a single block */ \
1060 0, /* prefetch distance */ \
1061 pixman_composite_src_n_0565_init, \
1062 pixman_composite_src_n_0565_cleanup, \
1063 pixman_composite_src_n_0565_process_pixblock_head, \
1064 pixman_composite_src_n_0565_process_pixblock_tail, \
1065 pixman_composite_src_n_0565_process_pixblock_tail_head, \
1066 0, /* dst_w_basereg */ \
1067 0, /* dst_r_basereg */ \
1068 0, /* src_basereg */ \
1069 0 /* mask_basereg */
1071 /******************************************************************************/
/* src_n_8888: fill an a8r8g8b8 destination with a solid value.  init
 * duplicates the 32-bit pixel across d0 via vsli #32 (broadcast to the
 * remaining store registers is not visible in this excerpt). */
1073 .macro pixman_composite_src_n_8888_process_pixblock_head
1076 .macro pixman_composite_src_n_8888_process_pixblock_tail
1079 .macro pixman_composite_src_n_8888_process_pixblock_tail_head
1080 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1083 .macro pixman_composite_src_n_8888_init
1084 add DUMMY, sp, #ARGS_STACK_OFFSET
1085 vld1.32 {d0[0]}, [DUMMY]
1086 vsli.u64 d0, d0, #32
1091 .macro pixman_composite_src_n_8888_cleanup
1094 generate_composite_function \
1095 pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
1096 FLAG_DST_WRITEONLY, \
1097 8, /* number of pixels, processed in a single block */ \
1098 0, /* prefetch distance */ \
1099 pixman_composite_src_n_8888_init, \
1100 pixman_composite_src_n_8888_cleanup, \
1101 pixman_composite_src_n_8888_process_pixblock_head, \
1102 pixman_composite_src_n_8888_process_pixblock_tail, \
1103 pixman_composite_src_n_8888_process_pixblock_tail_head, \
1104 0, /* dst_w_basereg */ \
1105 0, /* dst_r_basereg */ \
1106 0, /* src_basereg */ \
1107 0 /* mask_basereg */
1109 /******************************************************************************/
/* src_8888_8888: straight 32bpp -> 32bpp copy (store-only pixblock,
 * 8 pixels per block, write-only destination). */
1111 .macro pixman_composite_src_8888_8888_process_pixblock_head
1114 .macro pixman_composite_src_8888_8888_process_pixblock_tail
1117 .macro pixman_composite_src_8888_8888_process_pixblock_tail_head
1118 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1123 generate_composite_function \
1124 pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
1125 FLAG_DST_WRITEONLY, \
1126 8, /* number of pixels, processed in a single block */ \
1127 10, /* prefetch distance */ \
1130 pixman_composite_src_8888_8888_process_pixblock_head, \
1131 pixman_composite_src_8888_8888_process_pixblock_tail, \
1132 pixman_composite_src_8888_8888_process_pixblock_tail_head, \
1133 0, /* dst_w_basereg */ \
1134 0, /* dst_r_basereg */ \
1135 0, /* src_basereg */ \
1136 0 /* mask_basereg */
1138 /******************************************************************************/
/* src_x888_8888: copy x8r8g8b8 -> a8r8g8b8, forcing alpha to 0xFF.
 * init prepares the 0xFF000000 per-pixel mask in q2 by shifting left 24.
 * NOTE(review): the instruction that pre-fills q2 (presumably with 0xFF
 * bytes), and the head code that ORs q2 into the pixels, are not visible
 * in this excerpt -- confirm against upstream. */
1140 .macro pixman_composite_src_x888_8888_process_pixblock_head
1145 .macro pixman_composite_src_x888_8888_process_pixblock_tail
1148 .macro pixman_composite_src_x888_8888_process_pixblock_tail_head
1149 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1156 .macro pixman_composite_src_x888_8888_init
1158 vshl.u32 q2, q2, #24
1161 generate_composite_function \
1162 pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
1163 FLAG_DST_WRITEONLY, \
1164 8, /* number of pixels, processed in a single block */ \
1165 10, /* prefetch distance */ \
1166 pixman_composite_src_x888_8888_init, \
1168 pixman_composite_src_x888_8888_process_pixblock_head, \
1169 pixman_composite_src_x888_8888_process_pixblock_tail, \
1170 pixman_composite_src_x888_8888_process_pixblock_tail_head, \
1171 0, /* dst_w_basereg */ \
1172 0, /* dst_r_basereg */ \
1173 0, /* src_basereg */ \
1174 0 /* mask_basereg */
1176 /******************************************************************************/
/* over_n_8_8888: OVER a solid source through an a8 mask onto a8r8g8b8.
 * head: src channel * mask with rounded division by 255 (the
 * vmull + vrshr #8 + vraddhn idiom computes (p + ((p+128)>>8) + 128)>>8,
 * i.e. ~ p/255), then starts dest * ~alpha.  tail: finishes the dest
 * products and saturating-adds the masked source. */
1178 .macro pixman_composite_over_n_8_8888_process_pixblock_head
1179 /* expecting deinterleaved source data in {d8, d9, d10, d11} */
1180 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
1181 /* and destination data in {d4, d5, d6, d7} */
1182 /* mask is in d24 (d25, d26, d27 are unused) */
1185 vmull.u8 q0, d24, d8
1186 vmull.u8 q1, d24, d9
1187 vmull.u8 q6, d24, d10
1188 vmull.u8 q7, d24, d11
1189 vrshr.u16 q10, q0, #8
1190 vrshr.u16 q11, q1, #8
1191 vrshr.u16 q12, q6, #8
1192 vrshr.u16 q13, q7, #8
1193 vraddhn.u16 d0, q0, q10
1194 vraddhn.u16 d1, q1, q11
1195 vraddhn.u16 d2, q6, q12
1196 vraddhn.u16 d3, q7, q13
1197 vmvn.8 d24, d3 /* get inverted alpha */
1198 /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */
1199 /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
1200 /* now do alpha blending */
1201 vmull.u8 q8, d24, d4
1202 vmull.u8 q9, d24, d5
1203 vmull.u8 q10, d24, d6
1204 vmull.u8 q11, d24, d7
1207 .macro pixman_composite_over_n_8_8888_process_pixblock_tail
1208 vrshr.u16 q14, q8, #8
1209 vrshr.u16 q15, q9, #8
1210 vrshr.u16 q12, q10, #8
1211 vrshr.u16 q13, q11, #8
1212 vraddhn.u16 d28, q14, q8
1213 vraddhn.u16 d29, q15, q9
1214 vraddhn.u16 d30, q12, q10
1215 vraddhn.u16 d31, q13, q11
1216 vqadd.u8 q14, q0, q14
1217 vqadd.u8 q15, q1, q15
1220 /* TODO: expand macros and do better instructions scheduling */
1221 .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
1222 pixman_composite_over_n_8_8888_process_pixblock_tail
1223 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1224 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1227 pixman_composite_over_n_8_8888_process_pixblock_head
/* init loads the solid source colour; per-channel duplication into
 * d8-d11 is not visible in this excerpt. */
1230 .macro pixman_composite_over_n_8_8888_init
1231 add DUMMY, sp, #ARGS_STACK_OFFSET
1233 vld1.32 {d11[0]}, [DUMMY]
1240 .macro pixman_composite_over_n_8_8888_cleanup
1244 generate_composite_function \
1245 pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
1246 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1247 8, /* number of pixels, processed in a single block */ \
1248 5, /* prefetch distance */ \
1249 pixman_composite_over_n_8_8888_init, \
1250 pixman_composite_over_n_8_8888_cleanup, \
1251 pixman_composite_over_n_8_8888_process_pixblock_head, \
1252 pixman_composite_over_n_8_8888_process_pixblock_tail, \
1253 pixman_composite_over_n_8_8888_process_pixblock_tail_head
1255 /******************************************************************************/
/* over_n_8_8: OVER a solid a8 source (replicated in d8 by init) through
 * an a8 mask (d24-d27) onto an a8 destination, 32 pixels per block.
 * Same rounded /255 multiply scheme as over_n_8_8888. */
1257 .macro pixman_composite_over_n_8_8_process_pixblock_head
1258 vmull.u8 q0, d24, d8
1259 vmull.u8 q1, d25, d8
1260 vmull.u8 q6, d26, d8
1261 vmull.u8 q7, d27, d8
1262 vrshr.u16 q10, q0, #8
1263 vrshr.u16 q11, q1, #8
1264 vrshr.u16 q12, q6, #8
1265 vrshr.u16 q13, q7, #8
1266 vraddhn.u16 d0, q0, q10
1267 vraddhn.u16 d1, q1, q11
1268 vraddhn.u16 d2, q6, q12
1269 vraddhn.u16 d3, q7, q13
1272 vmull.u8 q8, d24, d4
1273 vmull.u8 q9, d25, d5
1274 vmull.u8 q10, d26, d6
1275 vmull.u8 q11, d27, d7
1278 .macro pixman_composite_over_n_8_8_process_pixblock_tail
1279 vrshr.u16 q14, q8, #8
1280 vrshr.u16 q15, q9, #8
1281 vrshr.u16 q12, q10, #8
1282 vrshr.u16 q13, q11, #8
1283 vraddhn.u16 d28, q14, q8
1284 vraddhn.u16 d29, q15, q9
1285 vraddhn.u16 d30, q12, q10
1286 vraddhn.u16 d31, q13, q11
1287 vqadd.u8 q14, q0, q14
1288 vqadd.u8 q15, q1, q15
1291 /* TODO: expand macros and do better instructions scheduling */
1292 .macro pixman_composite_over_n_8_8_process_pixblock_tail_head
1293 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1294 pixman_composite_over_n_8_8_process_pixblock_tail
1296 cache_preload 32, 32
1297 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1298 pixman_composite_over_n_8_8_process_pixblock_head
1301 .macro pixman_composite_over_n_8_8_init
1302 add DUMMY, sp, #ARGS_STACK_OFFSET
1304 vld1.32 {d8[0]}, [DUMMY]
1308 .macro pixman_composite_over_n_8_8_cleanup
1312 generate_composite_function \
1313 pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
1314 FLAG_DST_READWRITE, \
1315 32, /* number of pixels, processed in a single block */ \
1316 5, /* prefetch distance */ \
1317 pixman_composite_over_n_8_8_init, \
1318 pixman_composite_over_n_8_8_cleanup, \
1319 pixman_composite_over_n_8_8_process_pixblock_head, \
1320 pixman_composite_over_n_8_8_process_pixblock_tail, \
1321 pixman_composite_over_n_8_8_process_pixblock_tail_head
1323 /******************************************************************************/
/* over_n_8888_8888_ca: OVER with a solid source and a component-alpha
 * (per-channel 8888) mask.  head computes src*mask per channel and
 * alpha*mask per channel ('combine_mask_ca'), then starts
 * dest * ~(alpha*mask) ('combine_over_ca'); tail_head interleaves the
 * previous block's tail with the dest load/store for pipelining. */
1325 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1327 * 'combine_mask_ca' replacement
1329 * input: solid src (n) in {d8, d9, d10, d11}
1330 * dest in {d4, d5, d6, d7 }
1331 * mask in {d24, d25, d26, d27}
1332 * output: updated src in {d0, d1, d2, d3 }
1333 * updated mask in {d24, d25, d26, d3 }
1335 vmull.u8 q0, d24, d8
1336 vmull.u8 q1, d25, d9
1337 vmull.u8 q6, d26, d10
1338 vmull.u8 q7, d27, d11
1339 vmull.u8 q9, d11, d25
1340 vmull.u8 q12, d11, d24
1341 vmull.u8 q13, d11, d26
1342 vrshr.u16 q8, q0, #8
1343 vrshr.u16 q10, q1, #8
1344 vrshr.u16 q11, q6, #8
1345 vraddhn.u16 d0, q0, q8
1346 vraddhn.u16 d1, q1, q10
1347 vraddhn.u16 d2, q6, q11
1348 vrshr.u16 q11, q12, #8
1349 vrshr.u16 q8, q9, #8
1350 vrshr.u16 q6, q13, #8
1351 vrshr.u16 q10, q7, #8
1352 vraddhn.u16 d24, q12, q11
1353 vraddhn.u16 d25, q9, q8
1354 vraddhn.u16 d26, q13, q6
1355 vraddhn.u16 d3, q7, q10
1357 * 'combine_over_ca' replacement
1359 * output: updated dest in {d28, d29, d30, d31}
1363 vmull.u8 q8, d24, d4
1364 vmull.u8 q9, d25, d5
1366 vmull.u8 q10, d26, d6
1367 vmull.u8 q11, d27, d7
1370 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
1371 /* ... continue 'combine_over_ca' replacement */
1372 vrshr.u16 q14, q8, #8
1373 vrshr.u16 q15, q9, #8
1374 vrshr.u16 q6, q10, #8
1375 vrshr.u16 q7, q11, #8
1376 vraddhn.u16 d28, q14, q8
1377 vraddhn.u16 d29, q15, q9
1378 vraddhn.u16 d30, q6, q10
1379 vraddhn.u16 d31, q7, q11
1380 vqadd.u8 q14, q0, q14
1381 vqadd.u8 q15, q1, q15
/* tail_head: manually unrolled tail of the previous block interleaved
 * with the next dest load and head, to hide memory latency. */
1384 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1385 vrshr.u16 q14, q8, #8
1386 vrshr.u16 q15, q9, #8
1387 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1388 vrshr.u16 q6, q10, #8
1389 vrshr.u16 q7, q11, #8
1390 vraddhn.u16 d28, q14, q8
1391 vraddhn.u16 d29, q15, q9
1392 vraddhn.u16 d30, q6, q10
1393 vraddhn.u16 d31, q7, q11
1395 vqadd.u8 q14, q0, q14
1396 vqadd.u8 q15, q1, q15
1398 pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1399 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1402 .macro pixman_composite_over_n_8888_8888_ca_init
1403 add DUMMY, sp, #ARGS_STACK_OFFSET
1405 vld1.32 {d11[0]}, [DUMMY]
1412 .macro pixman_composite_over_n_8888_8888_ca_cleanup
1416 generate_composite_function \
1417 pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
1418 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1419 8, /* number of pixels, processed in a single block */ \
1420 5, /* prefetch distance */ \
1421 pixman_composite_over_n_8888_8888_ca_init, \
1422 pixman_composite_over_n_8888_8888_ca_cleanup, \
1423 pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
1424 pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
1425 pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1427 /******************************************************************************/
/* in_n_8: IN operation -- destination multiplied by the solid-source
 * alpha held in d3 (loaded by init).  NOTE(review): only the d6/d7
 * multiplies are visible in this excerpt; the matching d4/d5 ones (into
 * q8/q9, which tail reduces) appear elided -- confirm upstream. */
1429 .macro pixman_composite_in_n_8_process_pixblock_head
1430 /* expecting source data in {d0, d1, d2, d3} */
1431 /* and destination data in {d4, d5, d6, d7} */
1434 vmull.u8 q10, d6, d3
1435 vmull.u8 q11, d7, d3
1438 .macro pixman_composite_in_n_8_process_pixblock_tail
1439 vrshr.u16 q14, q8, #8
1440 vrshr.u16 q15, q9, #8
1441 vrshr.u16 q12, q10, #8
1442 vrshr.u16 q13, q11, #8
1443 vraddhn.u16 d28, q8, q14
1444 vraddhn.u16 d29, q9, q15
1445 vraddhn.u16 d30, q10, q12
1446 vraddhn.u16 d31, q11, q13
1449 .macro pixman_composite_in_n_8_process_pixblock_tail_head
1450 pixman_composite_in_n_8_process_pixblock_tail
1451 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1452 cache_preload 32, 32
1453 pixman_composite_in_n_8_process_pixblock_head
1454 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1457 .macro pixman_composite_in_n_8_init
1458 add DUMMY, sp, #ARGS_STACK_OFFSET
1459 vld1.32 {d3[0]}, [DUMMY]
1463 .macro pixman_composite_in_n_8_cleanup
1466 generate_composite_function \
1467 pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
1468 FLAG_DST_READWRITE, \
1469 32, /* number of pixels, processed in a single block */ \
1470 5, /* prefetch distance */ \
1471 pixman_composite_in_n_8_init, \
1472 pixman_composite_in_n_8_cleanup, \
1473 pixman_composite_in_n_8_process_pixblock_head, \
1474 pixman_composite_in_n_8_process_pixblock_tail, \
1475 pixman_composite_in_n_8_process_pixblock_tail_head, \
1476 28, /* dst_w_basereg */ \
1477 4, /* dst_r_basereg */ \
1478 0, /* src_basereg */ \
1479 24 /* mask_basereg */
/* add_n_8_8: ADD of a solid source through an a8 mask onto an a8
 * destination.  head computes mask * src_alpha with rounded /255
 * reduction and saturating-adds it to the dest (q2/q3); tail is empty. */
1481 .macro pixman_composite_add_n_8_8_process_pixblock_head
1482 /* expecting source data in {d8, d9, d10, d11} */
1483 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
1484 /* and destination data in {d4, d5, d6, d7} */
1485 /* mask is in d24, d25, d26, d27 */
1486 vmull.u8 q0, d24, d11
1487 vmull.u8 q1, d25, d11
1488 vmull.u8 q6, d26, d11
1489 vmull.u8 q7, d27, d11
1490 vrshr.u16 q10, q0, #8
1491 vrshr.u16 q11, q1, #8
1492 vrshr.u16 q12, q6, #8
1493 vrshr.u16 q13, q7, #8
1494 vraddhn.u16 d0, q0, q10
1495 vraddhn.u16 d1, q1, q11
1496 vraddhn.u16 d2, q6, q12
1497 vraddhn.u16 d3, q7, q13
1498 vqadd.u8 q14, q0, q2
1499 vqadd.u8 q15, q1, q3
1502 .macro pixman_composite_add_n_8_8_process_pixblock_tail
1505 /* TODO: expand macros and do better instructions scheduling */
1506 .macro pixman_composite_add_n_8_8_process_pixblock_tail_head
1507 pixman_composite_add_n_8_8_process_pixblock_tail
1508 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1509 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1511 cache_preload 32, 32
1512 pixman_composite_add_n_8_8_process_pixblock_head
1515 .macro pixman_composite_add_n_8_8_init
1516 add DUMMY, sp, #ARGS_STACK_OFFSET
1518 vld1.32 {d11[0]}, [DUMMY]
1522 .macro pixman_composite_add_n_8_8_cleanup
1526 generate_composite_function \
1527 pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
1528 FLAG_DST_READWRITE, \
1529 32, /* number of pixels, processed in a single block */ \
1530 5, /* prefetch distance */ \
1531 pixman_composite_add_n_8_8_init, \
1532 pixman_composite_add_n_8_8_cleanup, \
1533 pixman_composite_add_n_8_8_process_pixblock_head, \
1534 pixman_composite_add_n_8_8_process_pixblock_tail, \
1535 pixman_composite_add_n_8_8_process_pixblock_tail_head
1537 /******************************************************************************/
/* add_8_8_8: ADD with per-pixel a8 source, mask and destination:
 * src * mask (rounded /255) saturating-added to dest; tail is empty. */
1539 .macro pixman_composite_add_8_8_8_process_pixblock_head
1540 /* expecting source data in {d0, d1, d2, d3} */
1541 /* destination data in {d4, d5, d6, d7} */
1542 /* mask in {d24, d25, d26, d27} */
1543 vmull.u8 q8, d24, d0
1544 vmull.u8 q9, d25, d1
1545 vmull.u8 q10, d26, d2
1546 vmull.u8 q11, d27, d3
1547 vrshr.u16 q0, q8, #8
1548 vrshr.u16 q1, q9, #8
1549 vrshr.u16 q12, q10, #8
1550 vrshr.u16 q13, q11, #8
1551 vraddhn.u16 d0, q0, q8
1552 vraddhn.u16 d1, q1, q9
1553 vraddhn.u16 d2, q12, q10
1554 vraddhn.u16 d3, q13, q11
1555 vqadd.u8 q14, q0, q2
1556 vqadd.u8 q15, q1, q3
1559 .macro pixman_composite_add_8_8_8_process_pixblock_tail
1562 /* TODO: expand macros and do better instructions scheduling */
1563 .macro pixman_composite_add_8_8_8_process_pixblock_tail_head
1564 pixman_composite_add_8_8_8_process_pixblock_tail
1565 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1566 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1569 cache_preload 32, 32
1570 pixman_composite_add_8_8_8_process_pixblock_head
1573 .macro pixman_composite_add_8_8_8_init
1576 .macro pixman_composite_add_8_8_8_cleanup
1579 generate_composite_function \
1580 pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
1581 FLAG_DST_READWRITE, \
1582 32, /* number of pixels, processed in a single block */ \
1583 5, /* prefetch distance */ \
1584 pixman_composite_add_8_8_8_init, \
1585 pixman_composite_add_8_8_8_cleanup, \
1586 pixman_composite_add_8_8_8_process_pixblock_head, \
1587 pixman_composite_add_8_8_8_process_pixblock_tail, \
1588 pixman_composite_add_8_8_8_process_pixblock_tail_head
1590 /******************************************************************************/
/* add_8888_8888_8888: ADD with 32bpp source, mask and destination.  The
 * mask alpha channel (d27) scales every source channel; vrsra adds the
 * rounding correction term in-place so the tail only needs vrshrn plus a
 * saturating add with the dest.  The same pixblock code is also
 * instantiated below as the single-scanline 'add_mask' helper. */
1592 .macro pixman_composite_add_8888_8888_8888_process_pixblock_head
1593 /* expecting source data in {d0, d1, d2, d3} */
1594 /* destination data in {d4, d5, d6, d7} */
1595 /* mask in {d24, d25, d26, d27} */
1596 vmull.u8 q8, d27, d0
1597 vmull.u8 q9, d27, d1
1598 vmull.u8 q10, d27, d2
1599 vmull.u8 q11, d27, d3
1600 /* 1 cycle bubble */
1601 vrsra.u16 q8, q8, #8
1602 vrsra.u16 q9, q9, #8
1603 vrsra.u16 q10, q10, #8
1604 vrsra.u16 q11, q11, #8
1607 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
1608 /* 2 cycle bubble */
1609 vrshrn.u16 d28, q8, #8
1610 vrshrn.u16 d29, q9, #8
1611 vrshrn.u16 d30, q10, #8
1612 vrshrn.u16 d31, q11, #8
1613 vqadd.u8 q14, q2, q14
1614 /* 1 cycle bubble */
1615 vqadd.u8 q15, q3, q15
1618 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
1620 vrshrn.u16 d28, q8, #8
1622 vrshrn.u16 d29, q9, #8
1623 vmull.u8 q8, d27, d0
1624 vrshrn.u16 d30, q10, #8
1625 vmull.u8 q9, d27, d1
1626 vrshrn.u16 d31, q11, #8
1627 vmull.u8 q10, d27, d2
1628 vqadd.u8 q14, q2, q14
1629 vmull.u8 q11, d27, d3
1630 vqadd.u8 q15, q3, q15
1631 vrsra.u16 q8, q8, #8
1632 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1633 vrsra.u16 q9, q9, #8
1634 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1635 vrsra.u16 q10, q10, #8
1639 vrsra.u16 q11, q11, #8
1642 generate_composite_function \
1643 pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
1644 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1645 8, /* number of pixels, processed in a single block */ \
1646 10, /* prefetch distance */ \
1649 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1650 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1651 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
1653 generate_composite_function_single_scanline \
1654 pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
1655 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1656 8, /* number of pixels, processed in a single block */ \
1659 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1660 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1661 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
1663 /******************************************************************************/
/* add_8888_8_8888: a8-mask variant of add_8888_8888_8888; the shared
 * pixblock code reads the mask from d27 (mask_basereg = 27). */
1665 generate_composite_function \
1666 pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
1667 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1668 8, /* number of pixels, processed in a single block */ \
1669 5, /* prefetch distance */ \
1672 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1673 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1674 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
1675 28, /* dst_w_basereg */ \
1676 4, /* dst_r_basereg */ \
1677 0, /* src_basereg */ \
1678 27 /* mask_basereg */
1680 /******************************************************************************/
/* add_n_8_8888: solid-source variant; init loads the solid colour into
 * d3 (the source register set of the shared add pixblock code), with the
 * a8 mask read from d27. */
1682 .macro pixman_composite_add_n_8_8888_init
1683 add DUMMY, sp, #ARGS_STACK_OFFSET
1684 vld1.32 {d3[0]}, [DUMMY]
1691 .macro pixman_composite_add_n_8_8888_cleanup
1694 generate_composite_function \
1695 pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
1696 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1697 8, /* number of pixels, processed in a single block */ \
1698 5, /* prefetch distance */ \
1699 pixman_composite_add_n_8_8888_init, \
1700 pixman_composite_add_n_8_8888_cleanup, \
1701 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1702 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1703 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
1704 28, /* dst_w_basereg */ \
1705 4, /* dst_r_basereg */ \
1706 0, /* src_basereg */ \
1707 27 /* mask_basereg */
1709 /******************************************************************************/
/* add_8888_n_8888: solid-mask variant; init loads the solid mask value
 * (second stack argument, hence ARGS_STACK_OFFSET + 8) into d27, which
 * is the mask register of the shared add pixblock code. */
1711 .macro pixman_composite_add_8888_n_8888_init
1712 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
1713 vld1.32 {d27[0]}, [DUMMY]
1717 .macro pixman_composite_add_8888_n_8888_cleanup
1720 generate_composite_function \
1721 pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
1722 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1723 8, /* number of pixels, processed in a single block */ \
1724 5, /* prefetch distance */ \
1725 pixman_composite_add_8888_n_8888_init, \
1726 pixman_composite_add_8888_n_8888_cleanup, \
1727 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1728 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1729 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
1730 28, /* dst_w_basereg */ \
1731 4, /* dst_r_basereg */ \
1732 0, /* src_basereg */ \
1733 27 /* mask_basereg */
1735 /******************************************************************************/
/* out_reverse_8888_n_8888: dest = dest * (1 - alpha(src * solid_mask)).
 * head scales the source by the solid mask in d15 (rounded /255),
 * inverts the resulting alpha, and multiplies the destination by it;
 * tail completes the rounded reduction of the dest products. */
1737 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
1738 /* expecting source data in {d0, d1, d2, d3} */
1739 /* destination data in {d4, d5, d6, d7} */
1740 /* solid mask is in d15 */
1743 vmull.u8 q8, d15, d3
1744 vmull.u8 q6, d15, d2
1745 vmull.u8 q5, d15, d1
1746 vmull.u8 q4, d15, d0
1747 vrshr.u16 q13, q8, #8
1748 vrshr.u16 q12, q6, #8
1749 vrshr.u16 q11, q5, #8
1750 vrshr.u16 q10, q4, #8
1751 vraddhn.u16 d3, q8, q13
1752 vraddhn.u16 d2, q6, q12
1753 vraddhn.u16 d1, q5, q11
1754 vraddhn.u16 d0, q4, q10
1755 vmvn.8 d24, d3 /* get inverted alpha */
1756 /* now do alpha blending */
1757 vmull.u8 q8, d24, d4
1758 vmull.u8 q9, d24, d5
1759 vmull.u8 q10, d24, d6
1760 vmull.u8 q11, d24, d7
1763 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
1764 vrshr.u16 q14, q8, #8
1765 vrshr.u16 q15, q9, #8
1766 vrshr.u16 q12, q10, #8
1767 vrshr.u16 q13, q11, #8
1768 vraddhn.u16 d28, q14, q8
1769 vraddhn.u16 d29, q15, q9
1770 vraddhn.u16 d30, q12, q10
1771 vraddhn.u16 d31, q13, q11
1774 /* TODO: expand macros and do better instructions scheduling */
1775 .macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
1776 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1777 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
1781 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
1782 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* NOTE(review): in the invocation below the ..._tail_head argument line
 * ends in '\' with no trailing comma before the basereg list; gas also
 * accepts whitespace as a macro-argument separator, but this is worth
 * confirming against upstream. */
1785 generate_composite_function_single_scanline \
1786 pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
1787 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1788 8, /* number of pixels, processed in a single block */ \
1789 default_init_need_all_regs, \
1790 default_cleanup_need_all_regs, \
1791 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
1792 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
1793 pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \
1794 28, /* dst_w_basereg */ \
1795 4, /* dst_r_basereg */ \
1796 0, /* src_basereg */ \
1797 12 /* mask_basereg */
1799 /******************************************************************************/
/* over_8888_n_8888: OVER with 32bpp source and a solid mask in d15.
 * Reuses the out_reverse head/tail (dest * ~masked_src_alpha) and
 * finishes with a saturating add of the masked source held in q0/q1. */
1801 .macro pixman_composite_over_8888_n_8888_process_pixblock_head
1802 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
1805 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail
1806 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
1807 vqadd.u8 q14, q0, q14
1808 vqadd.u8 q15, q1, q15
1811 /* TODO: expand macros and do better instructions scheduling */
1812 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
1813 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1814 pixman_composite_over_8888_n_8888_process_pixblock_tail
1817 pixman_composite_over_8888_n_8888_process_pixblock_head
1818 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1821 .macro pixman_composite_over_8888_n_8888_init
1824 vld1.32 {d15[0]}, [DUMMY]
1828 .macro pixman_composite_over_8888_n_8888_cleanup
1832 generate_composite_function \
1833 pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
1834 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1835 8, /* number of pixels, processed in a single block */ \
1836 5, /* prefetch distance */ \
1837 pixman_composite_over_8888_n_8888_init, \
1838 pixman_composite_over_8888_n_8888_cleanup, \
1839 pixman_composite_over_8888_n_8888_process_pixblock_head, \
1840 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1841 pixman_composite_over_8888_n_8888_process_pixblock_tail_head
1843 /******************************************************************************/
/* over_8888_8888_8888: per-pixel 32bpp mask variant of OVER; only
 * tail_head is specific, the head/tail are the over_8888_n_8888 ones.
 * Also instantiated as the single-scanline 'over_mask' helper.
 * NOTE(review): the ..._tail_head argument lines in both invocations
 * below end in '\' with no trailing comma; gas also accepts whitespace
 * as a macro-argument separator, but confirm against upstream. */
1845 /* TODO: expand macros and do better instructions scheduling */
1846 .macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
1847 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1848 pixman_composite_over_8888_n_8888_process_pixblock_tail
1852 pixman_composite_over_8888_n_8888_process_pixblock_head
1853 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1856 generate_composite_function \
1857 pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
1858 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1859 8, /* number of pixels, processed in a single block */ \
1860 5, /* prefetch distance */ \
1861 default_init_need_all_regs, \
1862 default_cleanup_need_all_regs, \
1863 pixman_composite_over_8888_n_8888_process_pixblock_head, \
1864 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1865 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
1866 28, /* dst_w_basereg */ \
1867 4, /* dst_r_basereg */ \
1868 0, /* src_basereg */ \
1869 12 /* mask_basereg */
1871 generate_composite_function_single_scanline \
1872 pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
1873 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1874 8, /* number of pixels, processed in a single block */ \
1875 default_init_need_all_regs, \
1876 default_cleanup_need_all_regs, \
1877 pixman_composite_over_8888_n_8888_process_pixblock_head, \
1878 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1879 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
1880 28, /* dst_w_basereg */ \
1881 4, /* dst_r_basereg */ \
1882 0, /* src_basereg */ \
1883 12 /* mask_basereg */
1885 /******************************************************************************/
/* over_8888_8_8888: a8-mask variant of OVER (mask_basereg = 15, i.e. the
 * d15 slot the shared head expects), reusing the over_8888_n_8888
 * head/tail macros. */
1887 /* TODO: expand macros and do better instructions scheduling */
1888 .macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
1889 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1890 pixman_composite_over_8888_n_8888_process_pixblock_tail
1894 pixman_composite_over_8888_n_8888_process_pixblock_head
1895 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1898 generate_composite_function \
1899 pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
1900 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1901 8, /* number of pixels, processed in a single block */ \
1902 5, /* prefetch distance */ \
1903 default_init_need_all_regs, \
1904 default_cleanup_need_all_regs, \
1905 pixman_composite_over_8888_n_8888_process_pixblock_head, \
1906 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1907 pixman_composite_over_8888_8_8888_process_pixblock_tail_head \
1908 28, /* dst_w_basereg */ \
1909 4, /* dst_r_basereg */ \
1910 0, /* src_basereg */ \
1911 15 /* mask_basereg */
1913 /******************************************************************************/
/* src_0888_0888: straight 24bpp copy using 3-element interleaved stores
 * (vst3.8); head/tail are empty. */
1915 .macro pixman_composite_src_0888_0888_process_pixblock_head
1918 .macro pixman_composite_src_0888_0888_process_pixblock_tail
1921 .macro pixman_composite_src_0888_0888_process_pixblock_tail_head
1922 vst3.8 {d0, d1, d2}, [DST_W]!
1927 generate_composite_function \
1928 pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
1929 FLAG_DST_WRITEONLY, \
1930 8, /* number of pixels, processed in a single block */ \
1931 10, /* prefetch distance */ \
1934 pixman_composite_src_0888_0888_process_pixblock_head, \
1935 pixman_composite_src_0888_0888_process_pixblock_tail, \
1936 pixman_composite_src_0888_0888_process_pixblock_tail_head, \
1937 0, /* dst_w_basereg */ \
1938 0, /* dst_r_basereg */ \
1939 0, /* src_basereg */ \
1940 0 /* mask_basereg */
1942 /******************************************************************************/
/* src_0888_8888_rev: expand 24bpp (reversed channel order, per the
 * '_rev' naming) to 32bpp; the pixblock stores 4-plane data with vst4.8.
 * The channel handling itself is not visible in this excerpt. */
1944 .macro pixman_composite_src_0888_8888_rev_process_pixblock_head
1948 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
1951 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
1952 vst4.8 {d0, d1, d2, d3}, [DST_W]!
1958 .macro pixman_composite_src_0888_8888_rev_init
1962 generate_composite_function \
1963 pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
1964 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
1965 8, /* number of pixels, processed in a single block */ \
1966 10, /* prefetch distance */ \
1967 pixman_composite_src_0888_8888_rev_init, \
1969 pixman_composite_src_0888_8888_rev_process_pixblock_head, \
1970 pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
1971 pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
1972 0, /* dst_w_basereg */ \
1973 0, /* dst_r_basereg */ \
1974 0, /* src_basereg */ \
1975 0 /* mask_basereg */
1977 /******************************************************************************/
/* src_0888_0565_rev: pack 24bpp (reversed channel order) into r5g6b5.
 * The visible tail widens one channel with vshll #8 and merges the other
 * two (pre-widened into q8/q9 by head code not visible here) into the
 * 5/6/5 bitfields with vsri #5 / #11. */
1979 .macro pixman_composite_src_0888_0565_rev_process_pixblock_head
1984 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
1985 vshll.u8 q14, d0, #8
1986 vsri.u16 q14, q8, #5
1987 vsri.u16 q14, q9, #11
1990 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
1991 vshll.u8 q14, d0, #8
1993 vsri.u16 q14, q8, #5
1994 vsri.u16 q14, q9, #11
1996 vst1.16 {d28, d29}, [DST_W, :128]!
2000 generate_composite_function \
2001 pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
2002 FLAG_DST_WRITEONLY, \
2003 8, /* number of pixels, processed in a single block */ \
2004 10, /* prefetch distance */ \
2007 pixman_composite_src_0888_0565_rev_process_pixblock_head, \
2008 pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
2009 pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
2010 28, /* dst_w_basereg */ \
2011 0, /* dst_r_basereg */ \
2012 0, /* src_basereg */ \
2013 0 /* mask_basereg */
2015 /******************************************************************************/
/* src_pixbuf_8888: convert a pixbuf source, premultiplying on the fly --
 * colour channels are multiplied by alpha (d3) with the rounded /255
 * vrshr+vraddhn reduction (only the d2 multiply is visible here; the
 * d0/d1 ones into q8/q9 appear elided from this excerpt).  PF-prefixed
 * lines implement the adaptive source prefetcher. */
2017 .macro pixman_composite_src_pixbuf_8888_process_pixblock_head
2020 vmull.u8 q10, d3, d2
2023 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
2024 vrshr.u16 q11, q8, #8
2026 vrshr.u16 q12, q9, #8
2027 vrshr.u16 q13, q10, #8
2028 vraddhn.u16 d30, q11, q8
2029 vraddhn.u16 d29, q12, q9
2030 vraddhn.u16 d28, q13, q10
2033 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
2034 vrshr.u16 q11, q8, #8
2036 vrshr.u16 q12, q9, #8
2037 vrshr.u16 q13, q10, #8
2039 vraddhn.u16 d30, q11, q8
2040 PF add PF_X, PF_X, #8
2042 PF addne PF_X, PF_X, #8
2043 PF subne PF_CTL, PF_CTL, #1
2044 vraddhn.u16 d29, q12, q9
2045 vraddhn.u16 d28, q13, q10
2048 vmull.u8 q10, d3, d2
2049 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2051 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
2052 PF subge PF_X, PF_X, ORIG_W
2053 PF subges PF_CTL, PF_CTL, #0x10
2054 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
2057 generate_composite_function \
2058 pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
2059 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2060 8, /* number of pixels, processed in a single block */ \
2061 10, /* prefetch distance */ \
2064 pixman_composite_src_pixbuf_8888_process_pixblock_head, \
2065 pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
2066 pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
2067 28, /* dst_w_basereg */ \
2068 0, /* dst_r_basereg */ \
2069 0, /* src_basereg */ \
2070 0 /* mask_basereg */
2072 /******************************************************************************/
/*
 * SRC fast path for the "rpixbuf" format.  Identical arithmetic to the
 * pixbuf variant above; the only visible difference is that the rounded
 * channel results are written to d28..d30 in the opposite order
 * (d28 <- q8 product, d30 <- q10 product), i.e. with two colour
 * channels swapped in the output.
 */
2074 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
/* q10 = alpha (d3) * channel (d2) */
2077 vmull.u8 q10, d3, d2
/* Tail: rounded (x * a) / 255 narrowing, channel order swapped vs. pixbuf. */
2080 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
2081 vrshr.u16 q11, q8, #8
2083 vrshr.u16 q12, q9, #8
2084 vrshr.u16 q13, q10, #8
2085 vraddhn.u16 d28, q11, q8
2086 vraddhn.u16 d29, q12, q9
2087 vraddhn.u16 d30, q13, q10
/* Pipelined tail+head with prefetch bookkeeping (see pixbuf variant). */
2090 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
2091 vrshr.u16 q11, q8, #8
2093 vrshr.u16 q12, q9, #8
2094 vrshr.u16 q13, q10, #8
2096 vraddhn.u16 d28, q11, q8
2097 PF add PF_X, PF_X, #8
2099 PF addne PF_X, PF_X, #8
2100 PF subne PF_CTL, PF_CTL, #1
2101 vraddhn.u16 d29, q12, q9
2102 vraddhn.u16 d30, q13, q10
2105 vmull.u8 q10, d3, d2
2106 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2108 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
2109 PF subge PF_X, PF_X, ORIG_W
2110 PF subges PF_CTL, PF_CTL, #0x10
2111 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
/* Instantiate: 32bpp src, no mask, 32bpp dst, write-only, deinterleaved. */
2114 generate_composite_function \
2115 pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
2116 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2117 8, /* number of pixels, processed in a single block */ \
2118 10, /* prefetch distance */ \
2121 pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
2122 pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
2123 pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
2124 28, /* dst_w_basereg */ \
2125 0, /* dst_r_basereg */ \
2126 0, /* src_basereg */ \
2127 0 /* mask_basereg */
2129 /******************************************************************************/
/*
 * OVER with 8-bit mask: r5g6b5 source OVER r5g6b5 destination.
 * Head: unpack both 0565 buffers to planar x888, scale the source
 * channels by the mask, and start attenuating the destination.
 */
2131 .macro pixman_composite_over_0565_8_0565_process_pixblock_head
2132 /* mask is in d15 */
2133 convert_0565_to_x888 q4, d2, d1, d0
2134 convert_0565_to_x888 q5, d6, d5, d4
2135 /* source pixel data is in {d0, d1, d2, XX} */
2136 /* destination pixel data is in {d4, d5, d6, XX} */
/* src channel * mask, 8x8 -> 16 bit */
2138 vmull.u8 q6, d15, d2
2139 vmull.u8 q5, d15, d1
2140 vmull.u8 q4, d15, d0
/* d7 * dst channel; d7 is set on a line not visible in this chunk
 * (presumably the inverted alpha) — TODO confirm against full source */
2143 vmull.u8 q13, d7, d6
/* rounded /255 of the masked source: ((x + ((x+128)>>8)) + 128) >> 8 */
2144 vrshr.u16 q12, q6, #8
2145 vrshr.u16 q11, q5, #8
2146 vrshr.u16 q10, q4, #8
2147 vraddhn.u16 d2, q6, q12
2148 vraddhn.u16 d1, q5, q11
2149 vraddhn.u16 d0, q4, q10
/*
 * Tail: finish the /255 on the attenuated destination (q8/q9/q13),
 * saturating-add the masked source, and repack to r5g6b5 in d28/d29.
 */
2152 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail
2153 vrshr.u16 q14, q8, #8
2154 vrshr.u16 q15, q9, #8
2155 vrshr.u16 q12, q13, #8
2156 vraddhn.u16 d28, q14, q8
2157 vraddhn.u16 d29, q15, q9
2158 vraddhn.u16 d30, q12, q13
/* result = masked_src + attenuated_dst, with u8 saturation */
2159 vqadd.u8 q0, q0, q14
2160 vqadd.u8 q1, q1, q15
2161 /* 32bpp result is in {d0, d1, d2, XX} */
2162 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2165 /* TODO: expand macros and do better instructions scheduling */
/* Simple (unscheduled) tail+head: tail, reload destination, head, store. */
2166 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
2168 pixman_composite_over_0565_8_0565_process_pixblock_tail
2170 vld1.16 {d10, d11}, [DST_R, :128]!
2172 pixman_composite_over_0565_8_0565_process_pixblock_head
2173 vst1.16 {d28, d29}, [DST_W, :128]!
/* Instantiate: 16bpp src, 8bpp mask, 16bpp dst, read-write destination. */
2176 generate_composite_function \
2177 pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
2178 FLAG_DST_READWRITE, \
2179 8, /* number of pixels, processed in a single block */ \
2180 5, /* prefetch distance */ \
2181 default_init_need_all_regs, \
2182 default_cleanup_need_all_regs, \
2183 pixman_composite_over_0565_8_0565_process_pixblock_head, \
2184 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2185 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2186 28, /* dst_w_basereg */ \
2187 10, /* dst_r_basereg */ \
2188 8, /* src_basereg */ \
2189 15 /* mask_basereg */
2191 /******************************************************************************/
/*
 * OVER with solid (constant) mask: same pixel pipeline as the
 * over_0565_8_0565 fast path above, but the mask value is loaded once
 * from the stack into d15 at init time instead of per pixel.
 */
2193 .macro pixman_composite_over_0565_n_0565_init
/* address of the solid mask argument on the caller's stack */
2194 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
/* load the 32-bit constant mask into d15 (lane 0) */
2196 vld1.32 {d15[0]}, [DUMMY]
2200 .macro pixman_composite_over_0565_n_0565_cleanup
/* Instantiate: 16bpp src, solid mask (mask_bpp = 0), 16bpp dst;
 * reuses the over_0565_8_0565 pixblock macros. */
2204 generate_composite_function \
2205 pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
2206 FLAG_DST_READWRITE, \
2207 8, /* number of pixels, processed in a single block */ \
2208 5, /* prefetch distance */ \
2209 pixman_composite_over_0565_n_0565_init, \
2210 pixman_composite_over_0565_n_0565_cleanup, \
2211 pixman_composite_over_0565_8_0565_process_pixblock_head, \
2212 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2213 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2214 28, /* dst_w_basereg */ \
2215 10, /* dst_r_basereg */ \
2216 8, /* src_basereg */ \
2217 15 /* mask_basereg */
2219 /******************************************************************************/
/*
 * ADD with 8-bit mask: r5g6b5 source ADD r5g6b5 destination.
 * Head: unpack both 0565 buffers to planar x888 and compute the
 * mask-scaled source channels, rounded-divided by 255.
 * (The saturating add with the destination happens on lines not
 * visible in this chunk, before the tail's repack.)
 */
2221 .macro pixman_composite_add_0565_8_0565_process_pixblock_head
2222 /* mask is in d15 */
2223 convert_0565_to_x888 q4, d2, d1, d0
2224 convert_0565_to_x888 q5, d6, d5, d4
2225 /* source pixel data is in {d0, d1, d2, XX} */
2226 /* destination pixel data is in {d4, d5, d6, XX} */
/* src channel * mask */
2227 vmull.u8 q6, d15, d2
2228 vmull.u8 q5, d15, d1
2229 vmull.u8 q4, d15, d0
/* rounded /255 narrowing of the masked source */
2230 vrshr.u16 q12, q6, #8
2231 vrshr.u16 q11, q5, #8
2232 vrshr.u16 q10, q4, #8
2233 vraddhn.u16 d2, q6, q12
2234 vraddhn.u16 d1, q5, q11
2235 vraddhn.u16 d0, q4, q10
/* Tail: repack the 32bpp result back to r5g6b5 in d28/d29. */
2238 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail
2241 /* 32bpp result is in {d0, d1, d2, XX} */
2242 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2245 /* TODO: expand macros and do better instructions scheduling */
/* Simple (unscheduled) tail+head: tail, reload destination, head, store. */
2246 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
2248 pixman_composite_add_0565_8_0565_process_pixblock_tail
2250 vld1.16 {d10, d11}, [DST_R, :128]!
2252 pixman_composite_add_0565_8_0565_process_pixblock_head
2253 vst1.16 {d28, d29}, [DST_W, :128]!
/* Instantiate: 16bpp src, 8bpp mask, 16bpp dst, read-write destination. */
2256 generate_composite_function \
2257 pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
2258 FLAG_DST_READWRITE, \
2259 8, /* number of pixels, processed in a single block */ \
2260 5, /* prefetch distance */ \
2261 default_init_need_all_regs, \
2262 default_cleanup_need_all_regs, \
2263 pixman_composite_add_0565_8_0565_process_pixblock_head, \
2264 pixman_composite_add_0565_8_0565_process_pixblock_tail, \
2265 pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
2266 28, /* dst_w_basereg */ \
2267 10, /* dst_r_basereg */ \
2268 8, /* src_basereg */ \
2269 15 /* mask_basereg */
2271 /******************************************************************************/
/*
 * OUT_REVERSE: dst = dst * (1 - mask), with an 8-bit mask and a
 * r5g6b5 destination.  Head: unpack the destination to planar x888
 * and multiply every channel by the inverted mask.
 */
2273 .macro pixman_composite_out_reverse_8_0565_process_pixblock_head
2274 /* mask is in d15 */
2275 convert_0565_to_x888 q5, d6, d5, d4
2276 /* destination pixel data is in {d4, d5, d6, xx} */
2277 vmvn.8 d24, d15 /* get inverted alpha */
2278 /* now do alpha blending */
2279 vmull.u8 q8, d24, d4
2280 vmull.u8 q9, d24, d5
2281 vmull.u8 q10, d24, d6
/*
 * Tail: rounded /255 of the scaled channels
 * (((x + ((x+128)>>8)) + 128) >> 8), then repack to r5g6b5.
 */
2284 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
2285 vrshr.u16 q14, q8, #8
2286 vrshr.u16 q15, q9, #8
2287 vrshr.u16 q12, q10, #8
2288 vraddhn.u16 d0, q14, q8
2289 vraddhn.u16 d1, q15, q9
2290 vraddhn.u16 d2, q12, q10
2291 /* 32bpp result is in {d0, d1, d2, XX} */
2292 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2295 /* TODO: expand macros and do better instructions scheduling */
/* Simple (unscheduled) tail+head: tail, reload destination, head, store. */
2296 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
2298 pixman_composite_out_reverse_8_0565_process_pixblock_tail
2299 vld1.16 {d10, d11}, [DST_R, :128]!
2301 pixman_composite_out_reverse_8_0565_process_pixblock_head
2302 vst1.16 {d28, d29}, [DST_W, :128]!
/* Instantiate: 8bpp src (the mask acts as source), no mask arg,
 * 16bpp dst, read-write destination. */
2305 generate_composite_function \
2306 pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
2307 FLAG_DST_READWRITE, \
2308 8, /* number of pixels, processed in a single block */ \
2309 5, /* prefetch distance */ \
2310 default_init_need_all_regs, \
2311 default_cleanup_need_all_regs, \
2312 pixman_composite_out_reverse_8_0565_process_pixblock_head, \
2313 pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
2314 pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
2315 28, /* dst_w_basereg */ \
2316 10, /* dst_r_basereg */ \
2317 15, /* src_basereg */ \
2318 0 /* mask_basereg */
2320 /******************************************************************************/
/*
 * Nearest-neighbour scaled scanline functions.  Each instantiation
 * reuses the pixblock head/tail/tail_head macros of the corresponding
 * unscaled composite fast path defined earlier in this file.
 */
/* a8r8g8b8 OVER a8r8g8b8, nearest scaling */
2322 generate_composite_function_nearest_scanline \
2323 pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
2324 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2325 8, /* number of pixels, processed in a single block */ \
2328 pixman_composite_over_8888_8888_process_pixblock_head, \
2329 pixman_composite_over_8888_8888_process_pixblock_tail, \
2330 pixman_composite_over_8888_8888_process_pixblock_tail_head
/* a8r8g8b8 OVER r5g6b5, nearest scaling */
2332 generate_composite_function_nearest_scanline \
2333 pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
2334 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2335 8, /* number of pixels, processed in a single block */ \
2338 pixman_composite_over_8888_0565_process_pixblock_head, \
2339 pixman_composite_over_8888_0565_process_pixblock_tail, \
2340 pixman_composite_over_8888_0565_process_pixblock_tail_head, \
2341 28, /* dst_w_basereg */ \
2342 4, /* dst_r_basereg */ \
2343 0, /* src_basereg */ \
2344 24 /* mask_basereg */
/* a8r8g8b8 SRC to r5g6b5, nearest scaling */
2346 generate_composite_function_nearest_scanline \
2347 pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
2348 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2349 8, /* number of pixels, processed in a single block */ \
2352 pixman_composite_src_8888_0565_process_pixblock_head, \
2353 pixman_composite_src_8888_0565_process_pixblock_tail, \
2354 pixman_composite_src_8888_0565_process_pixblock_tail_head
/* r5g6b5 SRC to a8r8g8b8, nearest scaling */
2356 generate_composite_function_nearest_scanline \
2357 pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
2358 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2359 8, /* number of pixels, processed in a single block */ \
2362 pixman_composite_src_0565_8888_process_pixblock_head, \
2363 pixman_composite_src_0565_8888_process_pixblock_tail, \
2364 pixman_composite_src_0565_8888_process_pixblock_tail_head
/* a8r8g8b8 OVER r5g6b5 with a8 mask, nearest scaling */
2366 generate_composite_function_nearest_scanline \
2367 pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
2368 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2369 8, /* number of pixels, processed in a single block */ \
2370 default_init_need_all_regs, \
2371 default_cleanup_need_all_regs, \
2372 pixman_composite_over_8888_8_0565_process_pixblock_head, \
2373 pixman_composite_over_8888_8_0565_process_pixblock_tail, \
2374 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
2375 28, /* dst_w_basereg */ \
2376 4, /* dst_r_basereg */ \
2377 8, /* src_basereg */ \
2378 24 /* mask_basereg */
/* r5g6b5 OVER r5g6b5 with a8 mask, nearest scaling */
2380 generate_composite_function_nearest_scanline \
2381 pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
2382 FLAG_DST_READWRITE, \
2383 8, /* number of pixels, processed in a single block */ \
2384 default_init_need_all_regs, \
2385 default_cleanup_need_all_regs, \
2386 pixman_composite_over_0565_8_0565_process_pixblock_head, \
2387 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2388 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2389 28, /* dst_w_basereg */ \
2390 10, /* dst_r_basereg */ \
2391 8, /* src_basereg */ \
2392 15 /* mask_basereg */
2394 /******************************************************************************/
2396 /* Supplementary macro for setting function attributes */
/*
 * Declares `fname` as an ELF function symbol.  The rest of the macro
 * (global export, label, .func) lies on lines not visible in this chunk.
 */
2397 .macro pixman_asm_function fname
2402 .type fname, %function
2408 * Bilinear scaling support code which tries to provide pixel fetching, color
2409 * format conversion, and interpolation as separate macros which can be used
2410 * as the basic building blocks for constructing bilinear scanline functions.
/*
 * Fetch the two vertically adjacent source pixels for one destination
 * pixel.  X is 16.16 fixed point: the integer part (asr #16) indexes the
 * scanline; STRIDE = BOTTOM - TOP selects the second row.
 * 8888 variant: 32bpp pixels, so the byte offset is X_int << 2.
 */
2413 .macro bilinear_load_8888 reg1, reg2, tmp
2414 mov TMP1, X, asr #16
2416 add TMP1, TOP, TMP1, asl #2
/* reg1 = top-row pixel, reg2 = bottom-row pixel */
2417 vld1.32 {reg1}, [TMP1], STRIDE
2418 vld1.32 {reg2}, [TMP1]
/*
 * 0565 variant: 16bpp pixels (byte offset X_int << 1); both 0565 pixels
 * are loaded into reg2 lanes and unpacked to 32bpp via the helper macro.
 */
2421 .macro bilinear_load_0565 reg1, reg2, tmp
2422 mov TMP1, X, asr #16
2424 add TMP1, TOP, TMP1, asl #1
2425 vld1.32 {reg2[0]}, [TMP1], STRIDE
2426 vld1.32 {reg2[1]}, [TMP1]
2427 convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
/*
 * Load two horizontally adjacent source columns (top/bottom pixel pairs)
 * and vertically interpolate them: acc = top * d28 + bottom * d29,
 * where d28/d29 hold the vertical interpolation weights.
 */
2430 .macro bilinear_load_and_vertical_interpolate_two_8888 \
2431 acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
2433 bilinear_load_8888 reg1, reg2, tmp1
2434 vmull.u8 acc1, reg1, d28
2435 vmlal.u8 acc1, reg2, d29
2436 bilinear_load_8888 reg3, reg4, tmp2
2437 vmull.u8 acc2, reg3, d28
2438 vmlal.u8 acc2, reg4, d29
/* Four-pixel version: simply two back-to-back two-pixel interpolations. */
2441 .macro bilinear_load_and_vertical_interpolate_four_8888 \
2442 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
2443 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
2445 bilinear_load_and_vertical_interpolate_two_8888 \
2446 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
2447 bilinear_load_and_vertical_interpolate_two_8888 \
2448 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
/*
 * 0565 variants of the load + vertical interpolation macros.  The 16bpp
 * pixels are loaded into lanes of acc2 (two per column: top via the
 * STRIDE post-increment, bottom from the next row), expanded to planar
 * x888, then vertically blended with weights d28 (top) / d29 (bottom).
 */
2451 .macro bilinear_load_and_vertical_interpolate_two_0565 \
2452 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
/* X is 16.16 fixed point; 16bpp => byte offset X_int << 1 */
2454 mov TMP1, X, asr #16
2456 add TMP1, TOP, TMP1, asl #1
2457 mov TMP2, X, asr #16
2459 add TMP2, TOP, TMP2, asl #1
/* load top (post-incremented by STRIDE) and bottom pixels of each column */
2460 vld1.32 {acc2lo[0]}, [TMP1], STRIDE
2461 vld1.32 {acc2hi[0]}, [TMP2], STRIDE
2462 vld1.32 {acc2lo[1]}, [TMP1]
2463 vld1.32 {acc2hi[1]}, [TMP2]
2464 convert_0565_to_x888 acc2, reg3, reg2, reg1
/* vertical blend: acc = top * d28 + bottom * d29 */
2469 vmull.u8 acc1, reg1, d28
2470 vmlal.u8 acc1, reg2, d29
2471 vmull.u8 acc2, reg3, d28
2472 vmlal.u8 acc2, reg4, d29
/*
 * Four-pixel 0565 version, manually interleaved: the loads and vzip
 * shuffles of one pixel pair are scheduled between the arithmetic of
 * the other pair to hide load latency.
 */
2475 .macro bilinear_load_and_vertical_interpolate_four_0565 \
2476 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
2477 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
2479 mov TMP1, X, asr #16
2481 add TMP1, TOP, TMP1, asl #1
2482 mov TMP2, X, asr #16
2484 add TMP2, TOP, TMP2, asl #1
2485 vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
2486 vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
2487 vld1.32 {xacc2lo[1]}, [TMP1]
2488 vld1.32 {xacc2hi[1]}, [TMP2]
2489 convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
2490 mov TMP1, X, asr #16
2492 add TMP1, TOP, TMP1, asl #1
2493 mov TMP2, X, asr #16
2495 add TMP2, TOP, TMP2, asl #1
/* y-pair loads interleaved with x-pair register shuffles */
2496 vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
2497 vzip.u8 xreg1, xreg3
2498 vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
2499 vzip.u8 xreg2, xreg4
2500 vld1.32 {yacc2lo[1]}, [TMP1]
2501 vzip.u8 xreg3, xreg4
2502 vld1.32 {yacc2hi[1]}, [TMP2]
2503 vzip.u8 xreg1, xreg2
2504 convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
/* x-pair vertical blend interleaved with y-pair shuffles */
2505 vmull.u8 xacc1, xreg1, d28
2506 vzip.u8 yreg1, yreg3
2507 vmlal.u8 xacc1, xreg2, d29
2508 vzip.u8 yreg2, yreg4
2509 vmull.u8 xacc2, xreg3, d28
2510 vzip.u8 yreg3, yreg4
2511 vmlal.u8 xacc2, xreg4, d29
2512 vzip.u8 yreg1, yreg2
2513 vmull.u8 yacc1, yreg1, d28
2514 vmlal.u8 yacc1, yreg2, d29
2515 vmull.u8 yacc2, yreg3, d28
2516 vmlal.u8 yacc2, yreg4, d29
/*
 * Store `numpix` (4, 2 or 1) interpolated pixels from d0/d1 to OUT.
 * The .if/.elseif selectors on numpix sit on lines not visible in this
 * chunk; each vst1 below is one branch of that selection, and .error
 * rejects unsupported counts at assembly time.
 */
2519 .macro bilinear_store_8888 numpix, tmp1, tmp2
/* numpix == 4 */
2521 vst1.32 {d0, d1}, [OUT, :128]!
/* numpix == 2 */
2523 vst1.32 {d0}, [OUT, :64]!
/* numpix == 1 */
2525 vst1.32 {d0[0]}, [OUT, :32]!
2527 .error bilinear_store_8888 numpix is unsupported
/*
 * 0565 variant: pack the 32bpp result down to r5g6b5 first, then store
 * numpix (4/2/1) 16-bit pixels via the same hidden numpix selection.
 */
2531 .macro bilinear_store_0565 numpix, tmp1, tmp2
2536 convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
/* numpix == 4 */
2538 vst1.16 {d2}, [OUT, :64]!
/* numpix == 2 */
2540 vst1.32 {d2[0]}, [OUT, :32]!
/* numpix == 1 */
2542 vst1.16 {d2[0]}, [OUT, :16]!
2544 .error bilinear_store_0565 numpix is unsupported
/*
 * Interpolate a single pixel: vertical blend (q1 = top*d28 + bottom*d29),
 * then horizontal blend using weight d30:
 *   q0 = (d2 << 8) - d2*w + d3*w  =  left*(256 - w) + right*w,
 * narrowed back with >> 16.  Cycle-bubble comments mark NEON result
 * latencies on the original target core.
 */
2548 .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
2549 bilinear_load_&src_fmt d0, d1, d2
2550 vmull.u8 q1, d0, d28
2551 vmlal.u8 q1, d1, d29
2552 /* 5 cycles bubble */
2553 vshll.u16 q0, d2, #8
2554 vmlsl.u16 q0, d2, d30
2555 vmlal.u16 q0, d3, d30
2556 /* 5 cycles bubble */
2557 vshrn.u32 d0, q0, #16
2558 /* 3 cycles bubble */
2560 /* 1 cycle bubble */
2561 bilinear_store_&dst_fmt 1, q2, q3
/*
 * Two pixels: same scheme with per-pixel horizontal weights d30/d31,
 * then advance the fixed-point weight accumulator
 * (q15 = q12 >> 8 are the next weights; q12 += q13 steps the X fractions).
 */
2564 .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
2565 bilinear_load_and_vertical_interpolate_two_&src_fmt \
2566 q1, q11, d0, d1, d20, d21, d22, d23
2567 vshll.u16 q0, d2, #8
2568 vmlsl.u16 q0, d2, d30
2569 vmlal.u16 q0, d3, d30
2570 vshll.u16 q10, d22, #8
2571 vmlsl.u16 q10, d22, d31
2572 vmlal.u16 q10, d23, d31
2573 vshrn.u32 d0, q0, #16
2574 vshrn.u32 d1, q10, #16
2575 vshr.u16 q15, q12, #8
2576 vadd.u16 q12, q12, q13
2578 bilinear_store_&dst_fmt 2, q2, q3
/* Four pixels: two interleaved two-pixel horizontal blends. */
2581 .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
2582 bilinear_load_and_vertical_interpolate_four_&src_fmt \
2583 q1, q11, d0, d1, d20, d21, d22, d23 \
2584 q3, q9, d4, d5, d16, d17, d18, d19
/* undo the STRIDE post-increment from the last load */
2586 sub TMP1, TMP1, STRIDE
2587 vshll.u16 q0, d2, #8
2588 vmlsl.u16 q0, d2, d30
2589 vmlal.u16 q0, d3, d30
2590 vshll.u16 q10, d22, #8
2591 vmlsl.u16 q10, d22, d31
2592 vmlal.u16 q10, d23, d31
2593 vshr.u16 q15, q12, #8
2594 vshll.u16 q2, d6, #8
2595 vmlsl.u16 q2, d6, d30
2596 vmlal.u16 q2, d7, d30
2597 vshll.u16 q8, d18, #8
2599 vmlsl.u16 q8, d18, d31
2600 vmlal.u16 q8, d19, d31
2601 vadd.u16 q12, q12, q13
2602 vshrn.u32 d0, q0, #16
2603 vshrn.u32 d1, q10, #16
2604 vshrn.u32 d4, q2, #16
2605 vshrn.u32 d5, q8, #16
2606 vshr.u16 q15, q12, #8
2609 vadd.u16 q12, q12, q13
2610 bilinear_store_&dst_fmt 4, q2, q3
/*
 * Dispatch wrappers: if a hand-optimized head/tail/tail_head version
 * exists for this src/dst format combination (signalled by the
 * have_bilinear_interpolate_four_pixels_* symbol), use it; otherwise
 * fall back to the generic four-pixel macro above.
 */
2613 .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
2614 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
2615 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
2617 bilinear_interpolate_four_pixels src_fmt, dst_fmt
2621 .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
2622 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
2623 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
2627 .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
2628 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
2629 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
2631 bilinear_interpolate_four_pixels src_fmt, dst_fmt
2636 * Main template macro for generating NEON optimized bilinear scanline
2639 * Bilinear scanline scaler macro template uses the following arguments:
2640 * fname - name of the function to generate
2641 * src_fmt - source color format (8888 or 0565)
2642 * dst_fmt - destination color format (8888 or 0565)
2643 * bpp_shift - (1 << bpp_shift) is the size of source pixel in bytes
2644 * prefetch_distance - prefetch in the source image by that many pixels ahead
/*
 * Template for a complete bilinear scanline scaler.  Callee-saved GP
 * registers are preserved; several setup/branch lines (labels, .if
 * blocks, vpush of d8-d15) fall outside this chunk's visible lines.
 * Structure: setup -> destination-alignment peeling (1 pixel, then
 * 2 pixels) -> main 4-pixel software-pipelined loop (head /
 * tail_head / tail) -> trailing 2- and 1-pixel remainders.
 */
2648 .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
2649 src_bpp_shift, dst_bpp_shift, \
2652 pixman_asm_function fname
2669 push {r4, r5, r6, r7, r8, r9}
/* PF_OFFS = prefetch_distance * UX: source prefetch offset in 16.16 X units */
2670 mov PF_OFFS, #prefetch_distance
2671 ldmia ip, {WB, X, UX, WIDTH}
2672 mul PF_OFFS, PF_OFFS, UX
/* STRIDE = byte distance from the top to the bottom source scanline */
2674 sub STRIDE, BOTTOM, TOP
2684 vadd.u16 d25, d25, d26
2686 /* ensure good destination alignment */
/* peel one pixel if OUT is not aligned to 2 destination pixels */
2689 tst OUT, #(1 << dst_bpp_shift)
/* q15 = current horizontal weights (q12 >> 8); q12 += q13 steps X fractions */
2691 vshr.u16 q15, q12, #8
2692 vadd.u16 q12, q12, q13
2693 bilinear_interpolate_last_pixel src_fmt, dst_fmt
2694 sub WIDTH, WIDTH, #1
2696 vadd.u16 q13, q13, q13
2697 vshr.u16 q15, q12, #8
2698 vadd.u16 q12, q12, q13
/* peel two pixels if OUT is not aligned to 4 destination pixels */
2702 tst OUT, #(1 << (dst_bpp_shift + 1))
2704 bilinear_interpolate_two_pixels src_fmt, dst_fmt
2705 sub WIDTH, WIDTH, #2
2708 /* start the main loop */
2709 subs WIDTH, WIDTH, #4
/* convert PF_OFFS from 16.16 pixels to a byte offset for prefetching */
2711 mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
/* software-pipelined 4-pixel loop: prologue, steady state, epilogue */
2712 bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
2713 subs WIDTH, WIDTH, #4
2716 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
2717 subs WIDTH, WIDTH, #4
2720 bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
2723 /* handle the remaining trailing pixels */
2726 bilinear_interpolate_two_pixels src_fmt, dst_fmt
2730 bilinear_interpolate_last_pixel src_fmt, dst_fmt
2732 pop {r4, r5, r6, r7, r8, r9}
/*
 * Instantiate the bilinear scanline scalers for each supported
 * src/dst format pair.  Trailing args: src_bpp_shift, dst_bpp_shift
 * (log2 of bytes per pixel) and the prefetch distance.
 */
2752 generate_bilinear_scanline_func \
2753 pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, 2, 2, 28
2755 generate_bilinear_scanline_func \
2756 pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, 2, 1, 28
2758 generate_bilinear_scanline_func \
2759 pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, 1, 2, 28
2761 generate_bilinear_scanline_func \
2762 pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, 1, 1, 28