2 * Copyright © 2009 Nokia Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
23 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
27 * This file contains implementations of NEON optimized pixel processing
28 * functions. There is no full and detailed tutorial, but some functions
29 * (those which are exposing some new or interesting features) are
30 * extensively commented and can be used as examples.
32 * You may want to have a look at the comments for following functions:
33 * - pixman_composite_over_8888_0565_asm_neon
34 * - pixman_composite_over_n_8_0565_asm_neon
/* Prevent the stack from becoming executable for no reason... */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
/* NOTE(review): the matching #endif is not visible in this copy of the
   file - confirm against the complete source. */

.eabi_attribute 10, 0 /* suppress Tag_FP_arch */
.eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */

#include "pixman-arm-neon-asm.h"

/* Global configuration options and preferences */

/*
 * The code can optionally make use of unaligned memory accesses to improve
 * performance of handling leading/trailing pixels for each scanline.
 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
 * example in linux if unaligned memory accesses are not configured to
 * generate exceptions.
 */
.set RESPECT_STRICT_ALIGNMENT, 1

/*
 * Set default prefetch type. There is a choice between the following options:
 *
 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
 * as NOP to workaround some HW bugs or for whatever other reason)
 *
 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
 * advanced prefetch introduces heavy overhead)
 *
 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
 * which can run ARM and NEON instructions simultaneously so that extra ARM
 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
 *
 * Note: some types of function can't support advanced prefetch and fallback
 * to simple one (those which handle 24bpp pixels)
 */
.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED

/* Prefetch distance in pixels for simple prefetch */
.set PREFETCH_DISTANCE_SIMPLE, 64
86 * Implementation of pixman_composite_over_8888_0565_asm_neon
88 * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
89 * performs OVER compositing operation. Function fast_composite_over_8888_0565
90 * from pixman-fast-path.c does the same in C and can be used as a reference.
92 * First we need to have some NEON assembly code which can do the actual
93 * operation on the pixels and provide it to the template macro.
95 * Template macro quite conveniently takes care of emitting all the necessary
96 * code for memory reading and writing (including quite tricky cases of
97 * handling unaligned leading/trailing pixels), so we only need to deal with
98 * the data in NEON registers.
* NEON registers allocation in general is recommended to be the following:
101 * d0, d1, d2, d3 - contain loaded source pixel data
102 * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed)
* d24, d25, d26, d27 - contain loaded mask pixel data (if mask is used)
104 * d28, d29, d30, d31 - place for storing the result (destination pixels)
106 * As can be seen above, four 64-bit NEON registers are used for keeping
107 * intermediate pixel data and up to 8 pixels can be processed in one step
108 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
110 * This particular function uses the following registers allocation:
111 * d0, d1, d2, d3 - contain loaded source pixel data
112 * d4, d5 - contain loaded destination pixels (they are needed)
113 * d28, d29 - place for storing the result (destination pixels)
117 * Step one. We need to have some code to do some arithmetics on pixel data.
118 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
119 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
120 * perform all the needed calculations and write the result to {d28, d29}.
121 * The rationale for having two macros and not just one will be explained
* later. In practice, any single monolithic function which does the work can
123 * be split into two parts in any arbitrary way without affecting correctness.
125 * There is one special trick here too. Common template macro can optionally
126 * make our life a bit easier by doing R, G, B, A color components
127 * deinterleaving for 32bpp pixel formats (and this feature is used in
128 * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
129 * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
130 * actually use d0 register for blue channel (a vector of eight 8-bit
131 * values), d1 register for green, d2 for red and d3 for alpha. This
132 * simple conversion can be also done with a few NEON instructions:
134 * Packed to planar conversion:
140 * Planar to packed conversion:
146 * But pixel can be loaded directly in planar format using VLD4.8 NEON
147 * instruction. It is 1 cycle slower than VLD1.32, so this is not always
148 * desirable, that's why deinterleaving is optional.
150 * But anyway, here is the code:
.macro pixman_composite_over_8888_0565_process_pixblock_head
    /* Arithmetic 'head' for the a8r8g8b8 OVER r5g6b5 operation: consumes
       source in {d0-d3} (planar) and destination in {d4, d5}. */
    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
       and put data into d6 - red, d7 - green, d30 - blue */
    /* NOTE(review): several unpacking/multiply instructions appear to be
       missing from this copy of the file - confirm against the full source. */
    vmvn.8      d3, d3      /* invert source alpha */
    vshrn.u16   d30, q2, #2
    /* now do alpha blending, storing results in 8-bit planar format
       into d16 - red, d19 - green, d18 - blue */
    vmull.u8    q12, d3, d30    /* blue * (255 - src alpha), 16-bit products */
    vrshr.u16   q13, q10, #8    /* rounding terms for the divide-by-255 trick */
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
    vraddhn.u16 d20, q10, q13   /* (x + round(x >> 8)) >> 8 ~= x / 255 */
    vraddhn.u16 d23, q11, q3
    vraddhn.u16 d22, q12, q15
.macro pixman_composite_over_8888_0565_process_pixblock_tail
    /* Arithmetic 'tail': finish blending and pack the result to r5g6b5.
       NOTE(review): some add/pack instructions appear to be missing from
       this copy of the file - confirm against the full source. */
    /* ... continue alpha blending */
    vqadd.u8    d16, d2, d20    /* saturating add of the source channel */
    /* convert the result to r5g6b5 and store it into {d28, d29} */
    vshll.u8    q14, d16, #8
    vsri.u16    q14, q9, #11    /* insert the next field below the top bits */
188 * OK, now we got almost everything that we need. Using the above two
189 * macros, the work can be done right. But now we want to optimize
190 * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
191 * a lot from good code scheduling and software pipelining.
193 * Let's construct some code, which will run in the core main loop.
194 * Some pseudo-code of the main loop will look like this:
202 * It may look a bit weird, but this setup allows to hide instruction
203 * latencies better and also utilize dual-issue capability more
204 * efficiently (make pairs of load-store and ALU instructions).
206 * So what we need now is a '*_tail_head' macro, which will be used
207 * in the core main loop. A trivial straightforward implementation
208 * of this macro would look like this:
210 * pixman_composite_over_8888_0565_process_pixblock_tail
211 * vst1.16 {d28, d29}, [DST_W, :128]!
212 * vld1.16 {d4, d5}, [DST_R, :128]!
213 * vld4.32 {d0, d1, d2, d3}, [SRC]!
214 * pixman_composite_over_8888_0565_process_pixblock_head
217 * Now it also got some VLD/VST instructions. We simply can't move from
218 * processing one block of pixels to the other one with just arithmetics.
219 * The previously processed data needs to be written to memory and new
220 * data needs to be fetched. Fortunately, this main loop does not deal
221 * with partial leading/trailing pixels and can load/store a full block
222 * of pixels in a bulk. Additionally, destination buffer is already
223 * 16 bytes aligned here (which is good for performance).
225 * New things here are DST_R, DST_W, SRC and MASK identifiers. These
226 * are the aliases for ARM registers which are used as pointers for
227 * accessing data. We maintain separate pointers for reading and writing
228 * destination buffer (DST_R and DST_W).
230 * Another new thing is 'cache_preload' macro. It is used for prefetching
231 * data into CPU L2 cache and improve performance when dealing with large
232 * images which are far larger than cache size. It uses one argument
233 * (actually two, but they need to be the same here) - number of pixels
234 * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
235 * details about this macro. Moreover, if good performance is needed
236 * the code from this macro needs to be copied into '*_tail_head' macro
237 * and mixed with the rest of code for optimal instructions scheduling.
238 * We are actually doing it below.
240 * Now after all the explanations, here is the optimized code.
* Different instruction streams (originating from '*_head', '*_tail'
242 * and 'cache_preload' macro) use different indentation levels for
243 * better readability. Actually taking the code from one of these
244 * indentation levels and ignoring a few VLD/VST instructions would
245 * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
    /* Software-pipelined main-loop body: '*_tail' of the previous block,
       '*_head' of the next block and the prefetch (PF) bookkeeping are
       interleaved for dual-issue scheduling, as explained in the long
       comment above.  Indentation distinguishes the instruction streams. */
    vqadd.u8    d16, d2, d20
        vld1.16 {d4, d5}, [DST_R, :128]!
    vshll.u8    q14, d16, #8
        PF add PF_X, PF_X, #8
        PF addne PF_X, PF_X, #8
        PF subne PF_CTL, PF_CTL, #1
    vshrn.u16   d30, q2, #2
        PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmull.u8    q12, d3, d30
        PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vrshr.u16   q13, q10, #8
        PF subge PF_X, PF_X, ORIG_W
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
        PF subges PF_CTL, PF_CTL, #0x10
    vsri.u16    q14, q9, #11
        PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
        PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vraddhn.u16 d22, q12, q15
        vst1.16 {d28, d29}, [DST_W, :128]!
/* If we did not care much about the performance, we would just use this... */
/* NOTE(review): this is a second definition of the same macro name; in the
   complete file it is presumably disabled by a preprocessor guard (#if 0) -
   confirm before assembling. */
.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
    pixman_composite_over_8888_0565_process_pixblock_tail
    vst1.16     {d28, d29}, [DST_W, :128]!
    vld1.16     {d4, d5}, [DST_R, :128]!
    pixman_composite_over_8888_0565_process_pixblock_head
306 * And now the final part. We are using 'generate_composite_function' macro
307 * to put all the stuff together. We are specifying the name of the function
308 * which we want to get, number of bits per pixel for the source, mask and
309 * destination (0 if unused, like mask in this case). Next come some bit
311 * FLAG_DST_READWRITE - tells that the destination buffer is both read
312 * and written, for write-only buffer we would use
313 * FLAG_DST_WRITEONLY flag instead
314 * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
315 * and separate color channels for 32bpp format.
316 * The next things are:
317 * - the number of pixels processed per iteration (8 in this case, because
318 * that's the maximum what can fit into four 64-bit NEON registers).
319 * - prefetch distance, measured in pixel blocks. In this case it is 5 times
320 * by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
321 * prefetch distance can be selected by running some benchmarks.
323 * After that we specify some macros, these are 'default_init',
324 * 'default_cleanup' here which are empty (but it is possible to have custom
325 * init/cleanup macros to be able to save/restore some extra NEON registers
326 * like d8-d15 or do anything else) followed by
327 * 'pixman_composite_over_8888_0565_process_pixblock_head',
328 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
329 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
330 * which we got implemented above.
332 * The last part is the NEON registers allocation scheme.
/* Emit the full function: 8 pixels per block, a8r8g8b8 source (32 bpp),
   no mask (0), r5g6b5 destination (16 bpp).
   NOTE(review): the init/cleanup macro arguments are not visible in this
   copy - confirm against the complete source. */
generate_composite_function \
    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_8888_0565_process_pixblock_head, \
    pixman_composite_over_8888_0565_process_pixblock_tail, \
    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    24 /* mask_basereg */
349 /******************************************************************************/
.macro pixman_composite_over_n_0565_process_pixblock_head
    /* Solid-color OVER r5g6b5: same arithmetic as the over_8888_0565 head,
       but the source color is preloaded by the 'init' macro (its alpha is
       already inverted there, so no vmvn is needed here). */
    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
       and put data into d6 - red, d7 - green, d30 - blue */
    /* NOTE(review): some unpacking/multiply instructions appear to be
       missing from this copy of the file - confirm against the full source. */
    vshrn.u16   d30, q2, #2
    /* now do alpha blending, storing results in 8-bit planar format
       into d16 - red, d19 - green, d18 - blue */
    vmull.u8    q12, d3, d30
    vrshr.u16   q13, q10, #8
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
    vraddhn.u16 d22, q12, q15
.macro pixman_composite_over_n_0565_process_pixblock_tail
    /* Finish blending and pack the result back to r5g6b5 in {d28, d29}.
       NOTE(review): some instructions appear to be missing from this copy. */
    /* ... continue alpha blending */
    vqadd.u8    d16, d2, d20
    /* convert the result to r5g6b5 and store it into {d28, d29} */
    vshll.u8    q14, d16, #8
    vsri.u16    q14, q9, #11
/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_n_0565_process_pixblock_tail_head
    /* Unscheduled main-loop body: tail, block load/store, then head. */
    pixman_composite_over_n_0565_process_pixblock_tail
    vld1.16     {d4, d5}, [DST_R, :128]!
    vst1.16     {d28, d29}, [DST_W, :128]!
    pixman_composite_over_n_0565_process_pixblock_head
.macro pixman_composite_over_n_0565_init
    /* Fetch the solid source color from the stack (ARGS_STACK_OFFSET) into
       d3 and pre-invert its alpha for the blending head.
       NOTE(review): the replication of the color components into the other
       source registers appears to be missing from this copy - confirm. */
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    vmvn.8      d3, d3      /* invert source alpha */
/* Solid color OVER r5g6b5 destination (src bpp 0 = solid, no mask).
   NOTE(review): the cleanup macro argument is not visible in this copy. */
generate_composite_function \
    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_0565_init, \
    pixman_composite_over_n_0565_process_pixblock_head, \
    pixman_composite_over_n_0565_process_pixblock_tail, \
    pixman_composite_over_n_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    24 /* mask_basereg */
419 /******************************************************************************/
.macro pixman_composite_src_8888_0565_process_pixblock_head
    /* NOTE(review): macro body (a8r8g8b8 -> r5g6b5 packing) is not visible
       in this copy of the file - confirm against the complete source. */
.macro pixman_composite_src_8888_0565_process_pixblock_tail
    /* Finish packing the r5g6b5 result in q14 (insert the next field). */
    vsri.u16    q14, q9, #11
.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
    /* Pipelined main-loop body with prefetch (PF) bookkeeping interleaved.
       NOTE(review): the source load and some head instructions appear to
       be missing from this copy - confirm. */
        PF add PF_X, PF_X, #8
        PF addne PF_X, PF_X, #8
        PF subne PF_CTL, PF_CTL, #1
    vsri.u16    q14, q9, #11
        PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vst1.16     {d28, d29}, [DST_W, :128]!
        PF subge PF_X, PF_X, ORIG_W
        PF subges PF_CTL, PF_CTL, #0x10
        PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
/* Plain SRC copy with conversion: a8r8g8b8 -> r5g6b5, write-only dest.
   NOTE(review): init/cleanup arguments are not visible in this copy. */
generate_composite_function \
    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    pixman_composite_src_8888_0565_process_pixblock_head, \
    pixman_composite_src_8888_0565_process_pixblock_tail, \
    pixman_composite_src_8888_0565_process_pixblock_tail_head
462 /******************************************************************************/
.macro pixman_composite_src_0565_8888_process_pixblock_head
    /* Unpack r5g6b5 pixels in q0 into planar 8-bit channels; the narrowing
       shifts select the individual color fields.
       NOTE(review): the bit-replication/alpha-fill instructions appear to
       be missing from this copy - confirm against the full source. */
    vshrn.u16   d30, q0, #8
    vshrn.u16   d29, q0, #3
    vshrn.u16   d28, q0, #2
.macro pixman_composite_src_0565_8888_process_pixblock_tail
    /* No trailing work visible here - the conversion is done in the head. */
/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
    /* Unscheduled main-loop body.  NOTE(review): the source load appears
       to be missing from this copy - confirm. */
    pixman_composite_src_0565_8888_process_pixblock_tail
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    pixman_composite_src_0565_8888_process_pixblock_head
/* Plain SRC copy with conversion: r5g6b5 -> a8r8g8b8, write-only dest.
   NOTE(review): init/cleanup arguments are not visible in this copy. */
generate_composite_function \
    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    pixman_composite_src_0565_8888_process_pixblock_head, \
    pixman_composite_src_0565_8888_process_pixblock_tail, \
    pixman_composite_src_0565_8888_process_pixblock_tail_head
497 /******************************************************************************/
.macro pixman_composite_add_8_8_process_pixblock_head
    /* NOTE(review): macro body is not visible in this copy of the file -
       confirm against the complete source. */
.macro pixman_composite_add_8_8_process_pixblock_tail
    /* NOTE(review): macro body (if any) is not visible in this copy. */
.macro pixman_composite_add_8_8_process_pixblock_tail_head
    /* Main-loop body for the 8 bpp ADD: block load/store plus prefetch
       bookkeeping; 32 pixels are handled per iteration (hence the #32
       steps of PF_X).  The arithmetic lives in the head/tail macros, whose
       bodies are not visible in this copy. */
        PF add PF_X, PF_X, #32
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
        PF addne PF_X, PF_X, #32
        PF subne PF_CTL, PF_CTL, #1
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
        PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
        PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        PF subge PF_X, PF_X, ORIG_W
        PF subges PF_CTL, PF_CTL, #0x10
        PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
        PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
/* ADD of two a8 buffers, 32 pixels per block.
   NOTE(review): init/cleanup arguments are not visible in this copy. */
generate_composite_function \
    pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8_8_process_pixblock_tail_head
537 /******************************************************************************/
.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
    /* Same structure as the 8 bpp ADD loop body, but 8 pixels (32-bit)
       per iteration, hence the #8 steps of PF_X. */
        PF add PF_X, PF_X, #8
    vld1.32     {d4, d5, d6, d7}, [DST_R, :128]!
        PF addne PF_X, PF_X, #8
        PF subne PF_CTL, PF_CTL, #1
    vst1.32     {d28, d29, d30, d31}, [DST_W, :128]!
        PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
        PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        PF subge PF_X, PF_X, ORIG_W
        PF subges PF_CTL, PF_CTL, #0x10
        PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
        PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
/* ADD of two a8r8g8b8 buffers; reuses the add_8_8 head/tail macros as
   listed below.  NOTE(review): init/cleanup args not visible in this copy. */
generate_composite_function \
    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8888_8888_process_pixblock_tail_head
/* Single-scanline variant of the 32 bpp ADD (no prefetch distance arg for
   scanline functions).  NOTE(review): init/cleanup args not visible. */
generate_composite_function_single_scanline \
    pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8888_8888_process_pixblock_tail_head
579 /******************************************************************************/
.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
    /* OUT_REVERSE head: multiply destination channels by the inverted
       source alpha (d3).  NOTE(review): the q8/q9 multiplies consumed by
       the tail appear to be missing from this copy - confirm. */
    vmvn.8      d24, d3     /* get inverted alpha */
    /* do alpha blending */
    vmull.u8    q10, d24, d6
    vmull.u8    q11, d24, d7
.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
    /* Divide the 16-bit products by 255 using the vrshr + vraddhn rounding
       trick and narrow the four channels into the result regs d28-d31. */
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
    /* Pipelined main-loop body: previous block's tail, next block's head
       and prefetch bookkeeping interleaved for scheduling.
       NOTE(review): some head instructions (q8/q9 multiplies, source load)
       appear to be missing from this copy - confirm. */
        vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
    vrshr.u16   q14, q8, #8
        PF add PF_X, PF_X, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
        PF addne PF_X, PF_X, #8
        PF subne PF_CTL, PF_CTL, #1
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
        PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
        PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
        PF subge PF_X, PF_X, ORIG_W
        PF subges PF_CTL, PF_CTL, #0x10
        PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vmull.u8    q10, d22, d6
        PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7
/* Single-scanline OUT_REVERSE for 32 bpp buffers.
   NOTE(review): init/cleanup arguments are not visible in this copy. */
generate_composite_function_single_scanline \
    pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
641 /******************************************************************************/
.macro pixman_composite_over_8888_8888_process_pixblock_head
    /* OVER head is identical to OUT_REVERSE (dst * (255 - src.a)); the
       source is added back in the tail to complete the OVER operator. */
    pixman_composite_out_reverse_8888_8888_process_pixblock_head
.macro pixman_composite_over_8888_8888_process_pixblock_tail
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail
    /* saturating add of the source pixels completes the OVER operator */
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
    /* Pipelined main-loop body for 32 bpp OVER: out_reverse tail plus the
       two vqadd's, the next block's head multiplies, and prefetch
       bookkeeping, all interleaved.  NOTE(review): the source load and
       the q8/q9 multiplies appear to be missing from this copy. */
        vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
    vrshr.u16   q14, q8, #8
        PF add PF_X, PF_X, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
        PF addne PF_X, PF_X, #8
        PF subne PF_CTL, PF_CTL, #1
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
        PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
        PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
        PF subge PF_X, PF_X, ORIG_W
        PF subges PF_CTL, PF_CTL, #0x10
        PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vmull.u8    q10, d22, d6
        PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7
/* a8r8g8b8 OVER a8r8g8b8, 8 pixels per block.
   NOTE(review): init/cleanup arguments are not visible in this copy. */
generate_composite_function \
    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head
/* Single-scanline variant of 32 bpp OVER (no prefetch distance arg).
   NOTE(review): init/cleanup arguments are not visible in this copy. */
generate_composite_function_single_scanline \
    pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head
706 /******************************************************************************/
/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_n_8888_process_pixblock_tail_head
    /* Unscheduled loop body reusing the 8888_8888 OVER arithmetic; the
       solid source is preloaded once by the 'init' macro. */
    pixman_composite_over_8888_8888_process_pixblock_tail
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    pixman_composite_over_8888_8888_process_pixblock_head
.macro pixman_composite_over_n_8888_init
    /* Load the solid source color from the stack into d3.
       NOTE(review): replication of the color components into d0-d2 appears
       to be missing from this copy - confirm against the full source. */
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
/* Solid color OVER a8r8g8b8 destination.
   NOTE(review): the cleanup macro argument is not visible in this copy. */
generate_composite_function \
    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_init, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_n_8888_process_pixblock_tail_head
737 /******************************************************************************/
.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
    /* OVER_REVERSE with a solid color: note the swapped register roles -
       destination pixels are loaded into d0-d3 (see the generator's
       src/dst basereg arguments below).  Only destination prefetch is done
       since the source is a constant.  NOTE(review): some head
       instructions appear to be missing from this copy - confirm. */
    vrshr.u16   q14, q8, #8
        PF add PF_X, PF_X, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
        PF addne PF_X, PF_X, #8
        PF subne PF_CTL, PF_CTL, #1
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
        vld4.8 {d0, d1, d2, d3}, [DST_R, :128]!
        PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
        PF subge PF_X, PF_X, ORIG_W
        PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q10, d22, d6
        PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7
.macro pixman_composite_over_reverse_n_8888_init
    /* Load the solid color from the stack into d7 (the 'source' side for
       the reversed operator lives in d4-d7 here).
       NOTE(review): component replication appears to be missing from this
       copy - confirm against the full source. */
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d7[0]}, [DUMMY]
/* Solid color OVER_REVERSE a8r8g8b8: note the swapped dst_r/src basereg
   arguments compared to the other 32 bpp functions.
   NOTE(review): the cleanup macro argument is not visible in this copy. */
generate_composite_function \
    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_reverse_n_8888_init, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    4, /* src_basereg */ \
    24 /* mask_basereg */
792 /******************************************************************************/
.macro pixman_composite_over_8888_8_0565_process_pixblock_head
    /* Masked OVER onto r5g6b5: first apply the a8 mask (d24) to the source
       channels (IN), then unpack the r5g6b5 destination and start the
       alpha-blend multiplies.  Source lives in d8-d11 (see src_basereg).
       NOTE(review): several instructions appear to be missing from this
       copy of the file - confirm against the full source. */
    vmull.u8    q0, d24, d8     /* IN for SRC pixels (part1) */
    vmull.u8    q6, d24, d10
    vmull.u8    q7, d24, d11
    vshrn.u16   d6, q2, #8      /* convert DST_R data to 32-bpp (part1) */
    vrshr.u16   q8, q0, #8      /* IN for SRC pixels (part2) */
    vrshr.u16   q10, q6, #8
    vrshr.u16   q11, q7, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q9
    vraddhn.u16 d2, q6, q10
    vraddhn.u16 d3, q7, q11
    vsri.u8     d6, d6, #5      /* convert DST_R data to 32-bpp (part2) */
    vshrn.u16   d30, q2, #2
    vmull.u8    q8, d3, d6      /* now do alpha blending */
    vmull.u8    q10, d3, d30
.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
    /* Finish the alpha blend (divide-by-255 trick), add the masked source
       and pack the result to r5g6b5 in q14.
       NOTE(review): some add/pack instructions appear to be missing from
       this copy - confirm against the full source. */
    /* 3 cycle bubble (after vmull.u8) */
    vrshr.u16   q13, q8, #8
    vrshr.u16   q11, q9, #8
    vrshr.u16   q15, q10, #8
    vraddhn.u16 d16, q8, q13
    vraddhn.u16 d27, q9, q11
    vraddhn.u16 d26, q10, q15
    vqadd.u8    d16, d2, d16
    vshll.u8    q14, d16, #8    /* convert to 16bpp */
    vsri.u16    q14, q9, #11
.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
    /* Pipelined loop body: tail of the previous block interleaved with the
       head (IN + unpack + blend start) of the next one.
       NOTE(review): the source/mask loads and several instructions appear
       to be missing from this copy - confirm against the full source. */
        vld1.16 {d4, d5}, [DST_R, :128]!
    vmull.u8    q6, d24, d10
    vrshr.u16   q13, q8, #8
    vrshr.u16   q11, q9, #8
    vrshr.u16   q15, q10, #8
    vraddhn.u16 d16, q8, q13
    vraddhn.u16 d27, q9, q11
    vraddhn.u16 d26, q10, q15
    vqadd.u8    d16, d2, d16
    vshll.u8    q14, d16, #8
    vmull.u8    q7, d24, d11
    vsri.u16    q14, q9, #11
    vrshr.u16   q10, q6, #8
    vrshr.u16   q11, q7, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q9
    vraddhn.u16 d2, q6, q10
    vraddhn.u16 d3, q7, q11
    vshrn.u16   d30, q2, #2
        vst1.16 {d28, d29}, [DST_W, :128]!
    vmull.u8    q10, d3, d30
/* a8r8g8b8 source, a8 mask, r5g6b5 destination.  Needs all NEON registers,
   so the init/cleanup variants that save/restore d8-d15 are used. */
generate_composite_function \
    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4, /* dst_r_basereg */ \
    8, /* src_basereg */ \
    24 /* mask_basereg */
898 /******************************************************************************/
901 * This function needs a special initialization of solid mask.
902 * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
903 * offset, split into color components and replicated in d8-d11
904 * registers. Additionally, this function needs all the NEON registers,
905 * so it has to save d8-d15 registers which are callee saved according
906 * to ABI. These registers are restored from 'cleanup' macro. All the
907 * other NEON registers are caller saved, so can be clobbered freely
908 * without introducing any problems.
.macro pixman_composite_over_n_8_0565_init
    /* Fetch the solid source color and place it in d11 (split/replicated
       into d8-d11 per the commentary above).
       NOTE(review): the d8-d15 save (vpush) and the component replication
       appear to be missing from this copy - confirm. */
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d11[0]}, [DUMMY]
.macro pixman_composite_over_n_8_0565_cleanup
    /* NOTE(review): the restore of d8-d15 (vpop) is not visible in this
       copy of the file - confirm against the complete source. */
/* Solid color, a8 mask, r5g6b5 destination; reuses the over_8888_8_0565
   pixel-block macros with a custom init that preloads the color. */
generate_composite_function \
    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8_0565_init, \
    pixman_composite_over_n_8_0565_cleanup, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head
935 /******************************************************************************/
.macro pixman_composite_over_8888_n_0565_init
    /* Fetch the solid mask value (second stack argument, hence the +8
       offset) into d24, where the pixel-block macros expect the mask.
       NOTE(review): register save and mask replication appear to be
       missing from this copy - confirm. */
    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
    vld1.32     {d24[0]}, [DUMMY]
.macro pixman_composite_over_8888_n_0565_cleanup
    /* NOTE(review): cleanup body (register restore) is not visible in this
       copy of the file - confirm against the complete source. */
/* a8r8g8b8 source with a solid a8 mask, r5g6b5 destination; reuses the
   over_8888_8_0565 pixel-block macros. */
generate_composite_function \
    pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_8888_n_0565_init, \
    pixman_composite_over_8888_n_0565_cleanup, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4, /* dst_r_basereg */ \
    8, /* src_basereg */ \
    24 /* mask_basereg */
963 /******************************************************************************/
.macro pixman_composite_src_0565_0565_process_pixblock_head
    /* Plain 16 bpp copy: no per-pixel arithmetic is needed (body empty
       as visible here). */
.macro pixman_composite_src_0565_0565_process_pixblock_tail
    /* Nothing to finish for a plain copy (body empty as visible here). */
.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
    /* Store 16 r5g6b5 pixels per iteration.  NOTE(review): the matching
       source load appears to be missing from this copy - confirm. */
    vst1.16     {d0, d1, d2, d3}, [DST_W, :128]!
/* Straight r5g6b5 -> r5g6b5 copy, 16 pixels per block, write-only dest.
   NOTE(review): init/cleanup arguments and the trailing mask_basereg
   argument appear to be missing from this copy - confirm. */
generate_composite_function \
    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
    FLAG_DST_WRITEONLY, \
    16, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    pixman_composite_src_0565_0565_process_pixblock_head, \
    pixman_composite_src_0565_0565_process_pixblock_tail, \
    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
992 /******************************************************************************/
.macro pixman_composite_src_n_8_process_pixblock_head
    /* Solid fill: nothing per-block; d0-d3 hold the replicated value
       prepared by the 'init' macro (body empty as visible here). */
.macro pixman_composite_src_n_8_process_pixblock_tail
    /* Nothing to finish for a solid fill (body empty as visible here). */
.macro pixman_composite_src_n_8_process_pixblock_tail_head
    /* Store 32 a8 pixels of the replicated solid value per iteration. */
    vst1.8      {d0, d1, d2, d3}, [DST_W, :128]!
.macro pixman_composite_src_n_8_init
    /* Load the solid value from the stack and replicate it across d0 by
       repeated shift-left-and-insert.
       NOTE(review): the initial 8-bit replication step and the duplication
       into d1-d3 appear to be missing from this copy - confirm. */
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    vsli.u64    d0, d0, #16
    vsli.u64    d0, d0, #32
.macro pixman_composite_src_n_8_cleanup
    /* Nothing to restore (body empty as visible here). */
1017 generate_composite_function \
1018 pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
1019 FLAG_DST_WRITEONLY, \
1020 32, /* number of pixels, processed in a single block */ \
1021 0, /* prefetch distance */ \
1022 pixman_composite_src_n_8_init, \
1023 pixman_composite_src_n_8_cleanup, \
1024 pixman_composite_src_n_8_process_pixblock_head, \
1025 pixman_composite_src_n_8_process_pixblock_tail, \
1026 pixman_composite_src_n_8_process_pixblock_tail_head, \
1027 0, /* dst_w_basereg */ \
1028 0, /* dst_r_basereg */ \
1029 0, /* src_basereg */ \
1030 0 /* mask_basereg */
1032 /******************************************************************************/
/*
 * src_n_0565: fill an r5g6b5 destination with a solid color (SRC
 * operator).  Same structure as src_n_8: replicate the solid value in
 * d0 at init time, then tail_head just streams aligned stores.
 */
1034 .macro pixman_composite_src_n_0565_process_pixblock_head
1037 .macro pixman_composite_src_n_0565_process_pixblock_tail
1040 .macro pixman_composite_src_n_0565_process_pixblock_tail_head
/* store 16 r5g6b5 pixels per block */
1041 vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
1044 .macro pixman_composite_src_n_0565_init
1045 add DUMMY, sp, #ARGS_STACK_OFFSET
1046 vld1.32 {d0[0]}, [DUMMY]
/* replicate the 16-bit pixel across d0 */
1047 vsli.u64 d0, d0, #16
1048 vsli.u64 d0, d0, #32
1053 .macro pixman_composite_src_n_0565_cleanup
1056 generate_composite_function \
1057 pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
1058 FLAG_DST_WRITEONLY, \
1059 16, /* number of pixels, processed in a single block */ \
1060 0, /* prefetch distance */ \
1061 pixman_composite_src_n_0565_init, \
1062 pixman_composite_src_n_0565_cleanup, \
1063 pixman_composite_src_n_0565_process_pixblock_head, \
1064 pixman_composite_src_n_0565_process_pixblock_tail, \
1065 pixman_composite_src_n_0565_process_pixblock_tail_head, \
1066 0, /* dst_w_basereg */ \
1067 0, /* dst_r_basereg */ \
1068 0, /* src_basereg */ \
1069 0 /* mask_basereg */
1071 /******************************************************************************/
/*
 * src_n_8888: fill an a8r8g8b8 destination with a solid color (SRC
 * operator).  Only one vsli is needed at init since the value is
 * already 32 bits wide.
 */
1073 .macro pixman_composite_src_n_8888_process_pixblock_head
1076 .macro pixman_composite_src_n_8888_process_pixblock_tail
1079 .macro pixman_composite_src_n_8888_process_pixblock_tail_head
/* store 8 x8888 pixels per block */
1080 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1083 .macro pixman_composite_src_n_8888_init
1084 add DUMMY, sp, #ARGS_STACK_OFFSET
1085 vld1.32 {d0[0]}, [DUMMY]
/* duplicate the 32-bit pixel into both halves of d0 */
1086 vsli.u64 d0, d0, #32
1091 .macro pixman_composite_src_n_8888_cleanup
1094 generate_composite_function \
1095 pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
1096 FLAG_DST_WRITEONLY, \
1097 8, /* number of pixels, processed in a single block */ \
1098 0, /* prefetch distance */ \
1099 pixman_composite_src_n_8888_init, \
1100 pixman_composite_src_n_8888_cleanup, \
1101 pixman_composite_src_n_8888_process_pixblock_head, \
1102 pixman_composite_src_n_8888_process_pixblock_tail, \
1103 pixman_composite_src_n_8888_process_pixblock_tail_head, \
1104 0, /* dst_w_basereg */ \
1105 0, /* dst_r_basereg */ \
1106 0, /* src_basereg */ \
1107 0 /* mask_basereg */
1109 /******************************************************************************/
/*
 * src_8888_8888: straight 32bpp -> 32bpp copy (SRC operator).  Pure
 * streaming store in tail_head; head/tail are empty.
 */
1111 .macro pixman_composite_src_8888_8888_process_pixblock_head
1114 .macro pixman_composite_src_8888_8888_process_pixblock_tail
1117 .macro pixman_composite_src_8888_8888_process_pixblock_tail_head
1118 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1123 generate_composite_function \
1124 pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
1125 FLAG_DST_WRITEONLY, \
1126 8, /* number of pixels, processed in a single block */ \
1127 10, /* prefetch distance */ \
1130 pixman_composite_src_8888_8888_process_pixblock_head, \
1131 pixman_composite_src_8888_8888_process_pixblock_tail, \
1132 pixman_composite_src_8888_8888_process_pixblock_tail_head, \
1133 0, /* dst_w_basereg */ \
1134 0, /* dst_r_basereg */ \
1135 0, /* src_basereg */ \
1136 0 /* mask_basereg */
1138 /******************************************************************************/
/*
 * src_x888_8888: copy x8r8g8b8 -> a8r8g8b8, forcing the (undefined)
 * alpha byte to 0xFF.  init builds the 0xFF000000 mask in q2 (the
 * vmov.u8 q2, #0xFF that precedes the vshl is not visible in this
 * excerpt); presumably head ORs it into the pixels — TODO confirm
 * against the full file.
 */
1140 .macro pixman_composite_src_x888_8888_process_pixblock_head
1145 .macro pixman_composite_src_x888_8888_process_pixblock_tail
1148 .macro pixman_composite_src_x888_8888_process_pixblock_tail_head
1149 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1156 .macro pixman_composite_src_x888_8888_init
/* shift the per-byte 0xFF pattern up into the alpha lane: 0xFF000000 */
1158 vshl.u32 q2, q2, #24
1161 generate_composite_function \
1162 pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
1163 FLAG_DST_WRITEONLY, \
1164 8, /* number of pixels, processed in a single block */ \
1165 10, /* prefetch distance */ \
1166 pixman_composite_src_x888_8888_init, \
1168 pixman_composite_src_x888_8888_process_pixblock_head, \
1169 pixman_composite_src_x888_8888_process_pixblock_tail, \
1170 pixman_composite_src_x888_8888_process_pixblock_tail_head, \
1171 0, /* dst_w_basereg */ \
1172 0, /* dst_r_basereg */ \
1173 0, /* src_basereg */ \
1174 0 /* mask_basereg */
1176 /******************************************************************************/
/*
 * over_n_8_8888: OVER-composite a solid source through an a8 mask onto
 * an a8r8g8b8 destination.
 *
 * head: multiply each solid-color channel by the mask (vmull.u8 gives
 * a widened u16 product), then reduce each product with the standard
 * NEON division-by-255 rounding approximation:
 *     (t + ((t + 128) >> 8) + 128) >> 8
 * implemented as the vrshr.u16 #8 / vraddhn.u16 pair.  The inverted
 * result alpha then starts the dst * (255 - alpha) products.
 */
1178 .macro pixman_composite_over_n_8_8888_process_pixblock_head
1179 /* expecting deinterleaved source data in {d8, d9, d10, d11} */
1180 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
1181 /* and destination data in {d4, d5, d6, d7} */
1182 /* mask is in d24 (d25, d26, d27 are unused) */
1185 vmull.u8 q0, d24, d8
1186 vmull.u8 q1, d24, d9
1187 vmull.u8 q6, d24, d10
1188 vmull.u8 q7, d24, d11
/* rounded x/255 reduction of the four widened products */
1189 vrshr.u16 q10, q0, #8
1190 vrshr.u16 q11, q1, #8
1191 vrshr.u16 q12, q6, #8
1192 vrshr.u16 q13, q7, #8
1193 vraddhn.u16 d0, q0, q10
1194 vraddhn.u16 d1, q1, q11
1195 vraddhn.u16 d2, q6, q12
1196 vraddhn.u16 d3, q7, q13
1197 vmvn.8 d24, d3 /* get inverted alpha */
1198 /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */
1199 /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
1200 /* now do alpha blending */
1201 vmull.u8 q8, d24, d4
1202 vmull.u8 q9, d24, d5
1203 vmull.u8 q10, d24, d6
1204 vmull.u8 q11, d24, d7
/* tail: finish dst * (1 - alpha) reduction and saturating-add the
 * masked source, producing the final pixels in q14/q15 */
1207 .macro pixman_composite_over_n_8_8888_process_pixblock_tail
1208 vrshr.u16 q14, q8, #8
1209 vrshr.u16 q15, q9, #8
1210 vrshr.u16 q12, q10, #8
1211 vrshr.u16 q13, q11, #8
1212 vraddhn.u16 d28, q14, q8
1213 vraddhn.u16 d29, q15, q9
1214 vraddhn.u16 d30, q12, q10
1215 vraddhn.u16 d31, q13, q11
1216 vqadd.u8 q14, q0, q14
1217 vqadd.u8 q15, q1, q15
1220 /* TODO: expand macros and do better instructions scheduling */
1221 .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
1222 pixman_composite_over_n_8_8888_process_pixblock_tail
1223 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1224 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1227 pixman_composite_over_n_8_8888_process_pixblock_head
/* init: load the solid source color from the stack args into d11;
 * the rest of the color setup is not visible in this excerpt */
1230 .macro pixman_composite_over_n_8_8888_init
1231 add DUMMY, sp, #ARGS_STACK_OFFSET
1233 vld1.32 {d11[0]}, [DUMMY]
1240 .macro pixman_composite_over_n_8_8888_cleanup
1244 generate_composite_function \
1245 pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
1246 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1247 8, /* number of pixels, processed in a single block */ \
1248 5, /* prefetch distance */ \
1249 pixman_composite_over_n_8_8888_init, \
1250 pixman_composite_over_n_8_8888_cleanup, \
1251 pixman_composite_over_n_8_8888_process_pixblock_head, \
1252 pixman_composite_over_n_8_8888_process_pixblock_tail, \
1253 pixman_composite_over_n_8_8888_process_pixblock_tail_head
1255 /******************************************************************************/
/*
 * over_n_8_8: OVER-composite a solid source (alpha replicated in d8)
 * through an a8 mask (d24-d27, 32 pixels) onto an a8 destination
 * (d4-d7).  head computes mask*src and starts dst*(inverted coverage);
 * note the dst products reuse d24-d27 which presumably hold the
 * inverted mask*alpha by then (the intervening instructions are not
 * visible in this excerpt — TODO confirm against the full file).
 */
1257 .macro pixman_composite_over_n_8_8_process_pixblock_head
1258 vmull.u8 q0, d24, d8
1259 vmull.u8 q1, d25, d8
1260 vmull.u8 q6, d26, d8
1261 vmull.u8 q7, d27, d8
/* rounded x/255 reduction (vrshr #8 + vraddhn idiom) */
1262 vrshr.u16 q10, q0, #8
1263 vrshr.u16 q11, q1, #8
1264 vrshr.u16 q12, q6, #8
1265 vrshr.u16 q13, q7, #8
1266 vraddhn.u16 d0, q0, q10
1267 vraddhn.u16 d1, q1, q11
1268 vraddhn.u16 d2, q6, q12
1269 vraddhn.u16 d3, q7, q13
1272 vmull.u8 q8, d24, d4
1273 vmull.u8 q9, d25, d5
1274 vmull.u8 q10, d26, d6
1275 vmull.u8 q11, d27, d7
/* tail: reduce the dst products and saturating-add the source term */
1278 .macro pixman_composite_over_n_8_8_process_pixblock_tail
1279 vrshr.u16 q14, q8, #8
1280 vrshr.u16 q15, q9, #8
1281 vrshr.u16 q12, q10, #8
1282 vrshr.u16 q13, q11, #8
1283 vraddhn.u16 d28, q14, q8
1284 vraddhn.u16 d29, q15, q9
1285 vraddhn.u16 d30, q12, q10
1286 vraddhn.u16 d31, q13, q11
1287 vqadd.u8 q14, q0, q14
1288 vqadd.u8 q15, q1, q15
1291 /* TODO: expand macros and do better instructions scheduling */
1292 .macro pixman_composite_over_n_8_8_process_pixblock_tail_head
1293 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1294 pixman_composite_over_n_8_8_process_pixblock_tail
1296 cache_preload 32, 32
1297 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1298 pixman_composite_over_n_8_8_process_pixblock_head
/* init: load the 32-bit solid source from the stack into d8 */
1301 .macro pixman_composite_over_n_8_8_init
1302 add DUMMY, sp, #ARGS_STACK_OFFSET
1304 vld1.32 {d8[0]}, [DUMMY]
1308 .macro pixman_composite_over_n_8_8_cleanup
1312 generate_composite_function \
1313 pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
1314 FLAG_DST_READWRITE, \
1315 32, /* number of pixels, processed in a single block */ \
1316 5, /* prefetch distance */ \
1317 pixman_composite_over_n_8_8_init, \
1318 pixman_composite_over_n_8_8_cleanup, \
1319 pixman_composite_over_n_8_8_process_pixblock_head, \
1320 pixman_composite_over_n_8_8_process_pixblock_tail, \
1321 pixman_composite_over_n_8_8_process_pixblock_tail_head
1323 /******************************************************************************/
/*
 * over_n_8888_8888_ca: component-alpha OVER of a solid source through
 * an a8r8g8b8 mask onto an a8r8g8b8 destination.  Two stages, fused:
 *   1) combine_mask_ca:  src' = src * mask (per channel) and
 *      mask' = mask * src.alpha (per channel) — both via the
 *      vmull / vrshr #8 / vraddhn rounded x/255 idiom;
 *   2) combine_over_ca:  dst' = src' + (1 - mask') * dst, with the
 *      dst multiplies started in head and finished in tail.
 * Note: the mask inversion between the stages is not visible in this
 * excerpt (d24-d27 are used uninverted in the vmull below) — TODO
 * confirm the vmvn against the full file.
 */
1325 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1327 * 'combine_mask_ca' replacement
1329 * input: solid src (n) in {d8, d9, d10, d11}
1330 * dest in {d4, d5, d6, d7 }
1331 * mask in {d24, d25, d26, d27}
1332 * output: updated src in {d0, d1, d2, d3 }
1333 * updated mask in {d24, d25, d26, d3 }
1335 vmull.u8 q0, d24, d8
1336 vmull.u8 q1, d25, d9
1337 vmull.u8 q6, d26, d10
1338 vmull.u8 q7, d27, d11
1339 vmull.u8 q9, d11, d25
1340 vmull.u8 q12, d11, d24
1341 vmull.u8 q13, d11, d26
1342 vrshr.u16 q8, q0, #8
1343 vrshr.u16 q10, q1, #8
1344 vrshr.u16 q11, q6, #8
1345 vraddhn.u16 d0, q0, q8
1346 vraddhn.u16 d1, q1, q10
1347 vraddhn.u16 d2, q6, q11
1348 vrshr.u16 q11, q12, #8
1349 vrshr.u16 q8, q9, #8
1350 vrshr.u16 q6, q13, #8
1351 vrshr.u16 q10, q7, #8
1352 vraddhn.u16 d24, q12, q11
1353 vraddhn.u16 d25, q9, q8
1354 vraddhn.u16 d26, q13, q6
1355 vraddhn.u16 d3, q7, q10
1357 * 'combine_over_ca' replacement
1359 * output: updated dest in {d28, d29, d30, d31}
/* start per-channel dst multiplies; reduced in tail */
1363 vmull.u8 q8, d24, d4
1364 vmull.u8 q9, d25, d5
1366 vmull.u8 q10, d26, d6
1367 vmull.u8 q11, d27, d7
1370 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
1371 /* ... continue 'combine_over_ca' replacement */
1372 vrshr.u16 q14, q8, #8
1373 vrshr.u16 q15, q9, #8
1374 vrshr.u16 q6, q10, #8
1375 vrshr.u16 q7, q11, #8
1376 vraddhn.u16 d28, q14, q8
1377 vraddhn.u16 d29, q15, q9
1378 vraddhn.u16 d30, q6, q10
1379 vraddhn.u16 d31, q7, q11
/* saturating add of the combined source term */
1380 vqadd.u8 q14, q0, q14
1381 vqadd.u8 q15, q1, q15
/* tail_head: interleaves the tail of the previous block with the dst
 * load, the head of the next block and the store of the finished
 * pixels, to hide latencies */
1384 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1385 vrshr.u16 q14, q8, #8
1386 vrshr.u16 q15, q9, #8
1387 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1388 vrshr.u16 q6, q10, #8
1389 vrshr.u16 q7, q11, #8
1390 vraddhn.u16 d28, q14, q8
1391 vraddhn.u16 d29, q15, q9
1392 vraddhn.u16 d30, q6, q10
1393 vraddhn.u16 d31, q7, q11
1395 vqadd.u8 q14, q0, q14
1396 vqadd.u8 q15, q1, q15
1398 pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1399 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* init: load the solid source color from the stack into d11 */
1402 .macro pixman_composite_over_n_8888_8888_ca_init
1403 add DUMMY, sp, #ARGS_STACK_OFFSET
1405 vld1.32 {d11[0]}, [DUMMY]
1412 .macro pixman_composite_over_n_8888_8888_ca_cleanup
1416 generate_composite_function \
1417 pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
1418 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1419 8, /* number of pixels, processed in a single block */ \
1420 5, /* prefetch distance */ \
1421 pixman_composite_over_n_8888_8888_ca_init, \
1422 pixman_composite_over_n_8888_8888_ca_cleanup, \
1423 pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
1424 pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
1425 pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1427 /******************************************************************************/
/*
 * over_n_8888_0565_ca: component-alpha OVER of a solid source through
 * an a8r8g8b8 mask onto an r5g6b5 destination.  Same combine_mask_ca /
 * combine_over_ca structure as the 8888 variant, plus r5g6b5
 * unpack/repack of the destination:
 *   unpack: vshrn narrows the 16-bit pixels down to the top bits of
 *           each field, vsri replicates those bits to fill 8 bits;
 *   repack: vshll widens each channel, vsri merges G and B under R.
 */
1429 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
1431 * 'combine_mask_ca' replacement
1433 * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A]
1434 * mask in {d24, d25, d26} [B, G, R]
1435 * output: updated src in {d0, d1, d2 } [B, G, R]
1436 * updated mask in {d24, d25, d26} [B, G, R]
1438 vmull.u8 q0, d24, d8
1439 vmull.u8 q1, d25, d9
1440 vmull.u8 q6, d26, d10
1441 vmull.u8 q9, d11, d25
1442 vmull.u8 q12, d11, d24
1443 vmull.u8 q13, d11, d26
/* rounded x/255 reductions (vrshr #8 + vraddhn idiom) */
1444 vrshr.u16 q8, q0, #8
1445 vrshr.u16 q10, q1, #8
1446 vrshr.u16 q11, q6, #8
1447 vraddhn.u16 d0, q0, q8
1448 vraddhn.u16 d1, q1, q10
1449 vraddhn.u16 d2, q6, q11
1450 vrshr.u16 q11, q12, #8
1451 vrshr.u16 q8, q9, #8
1452 vrshr.u16 q6, q13, #8
1453 vraddhn.u16 d24, q12, q11
1454 vraddhn.u16 d25, q9, q8
1456 * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
1457 * and put data into d16 - blue, d17 - green, d18 - red
1459 vshrn.u16 d17, q2, #3
1460 vshrn.u16 d18, q2, #8
1461 vraddhn.u16 d26, q13, q6
/* replicate the top field bits into the low bits of each 8-bit lane */
1463 vsri.u8 d18, d18, #5
1464 vsri.u8 d17, d17, #6
1466 * 'combine_over_ca' replacement
1468 * output: updated dest in d16 - blue, d17 - green, d18 - red
1471 vshrn.u16 d16, q2, #2
/* start per-channel dst * mask' multiplies; reduced in tail */
1473 vmull.u8 q6, d16, d24
1474 vmull.u8 q7, d17, d25
1475 vmull.u8 q11, d18, d26
1478 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
1479 /* ... continue 'combine_over_ca' replacement */
1480 vrshr.u16 q10, q6, #8
1481 vrshr.u16 q14, q7, #8
1482 vrshr.u16 q15, q11, #8
1483 vraddhn.u16 d16, q10, q6
1484 vraddhn.u16 d17, q14, q7
1485 vraddhn.u16 d18, q15, q11
/* saturating add of the source red term (d2) */
1487 vqadd.u8 d18, d2, d18
1489 * convert the results in d16, d17, d18 to r5g6b5 and store
1490 * them into {d28, d29}
1492 vshll.u8 q14, d18, #8
1493 vshll.u8 q10, d17, #8
1494 vshll.u8 q15, d16, #8
/* merge green under red (#5) and blue under that (#11) */
1495 vsri.u16 q14, q10, #5
1496 vsri.u16 q14, q15, #11
/* tail_head: manually interleaved tail + head + dst load/store for
 * better scheduling (this is the expanded form of the two macros
 * above with the memory ops mixed in) */
1499 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
1501 vrshr.u16 q10, q6, #8
1502 vrshr.u16 q14, q7, #8
1503 vld1.16 {d4, d5}, [DST_R, :128]!
1504 vrshr.u16 q15, q11, #8
1505 vraddhn.u16 d16, q10, q6
1506 vraddhn.u16 d17, q14, q7
1507 vraddhn.u16 d22, q15, q11
1508 /* process_pixblock_head */
1510 * 'combine_mask_ca' replacement
1512 * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A]
1513 * mask in {d24, d25, d26} [B, G, R]
1514 * output: updated src in {d0, d1, d2 } [B, G, R]
1515 * updated mask in {d24, d25, d26} [B, G, R]
1517 vmull.u8 q6, d26, d10
1519 vmull.u8 q0, d24, d8
1520 vqadd.u8 d22, d2, d22
1521 vmull.u8 q1, d25, d9
1523 * convert the result in d16, d17, d22 to r5g6b5 and store
1524 * it into {d28, d29}
1526 vshll.u8 q14, d22, #8
1527 vshll.u8 q10, d17, #8
1528 vshll.u8 q15, d16, #8
1529 vmull.u8 q9, d11, d25
1530 vsri.u16 q14, q10, #5
1531 vmull.u8 q12, d11, d24
1532 vmull.u8 q13, d11, d26
1533 vsri.u16 q14, q15, #11
1535 vrshr.u16 q8, q0, #8
1536 vrshr.u16 q10, q1, #8
1537 vrshr.u16 q11, q6, #8
1538 vraddhn.u16 d0, q0, q8
1539 vraddhn.u16 d1, q1, q10
1540 vraddhn.u16 d2, q6, q11
1541 vrshr.u16 q11, q12, #8
1542 vrshr.u16 q8, q9, #8
1543 vrshr.u16 q6, q13, #8
1544 vraddhn.u16 d24, q12, q11
1545 vraddhn.u16 d25, q9, q8
1547 * convert 8 r5g6b5 pixel data from {d4, d5} to planar
1548 * 8-bit format and put data into d16 - blue, d17 - green,
1551 vshrn.u16 d17, q2, #3
1552 vshrn.u16 d18, q2, #8
1553 vraddhn.u16 d26, q13, q6
1555 vsri.u8 d17, d17, #6
1556 vsri.u8 d18, d18, #5
1558 * 'combine_over_ca' replacement
1560 * output: updated dest in d16 - blue, d17 - green, d18 - red
1563 vshrn.u16 d16, q2, #2
1565 vmull.u8 q7, d17, d25
1566 vmull.u8 q6, d16, d24
1567 vmull.u8 q11, d18, d26
/* store the 8 repacked r5g6b5 pixels of the previous block */
1568 vst1.16 {d28, d29}, [DST_W, :128]!
/* init: load the solid source color from the stack into d11 */
1571 .macro pixman_composite_over_n_8888_0565_ca_init
1572 add DUMMY, sp, #ARGS_STACK_OFFSET
1574 vld1.32 {d11[0]}, [DUMMY]
1581 .macro pixman_composite_over_n_8888_0565_ca_cleanup
1585 generate_composite_function \
1586 pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
1587 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1588 8, /* number of pixels, processed in a single block */ \
1589 5, /* prefetch distance */ \
1590 pixman_composite_over_n_8888_0565_ca_init, \
1591 pixman_composite_over_n_8888_0565_ca_cleanup, \
1592 pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
1593 pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
1594 pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
1596 /******************************************************************************/
/*
 * in_n_8: IN operator with a solid source on an a8 destination —
 * dst' = dst * src.alpha.  The solid alpha is replicated in d3 by
 * init; head multiplies 32 destination bytes by it (the q8/q9
 * multiplies for d4/d5 are not visible in this excerpt) and tail does
 * the rounded x/255 reduction into d28-d31.
 */
1598 .macro pixman_composite_in_n_8_process_pixblock_head
1599 /* expecting source data in {d0, d1, d2, d3} */
1600 /* and destination data in {d4, d5, d6, d7} */
1603 vmull.u8 q10, d6, d3
1604 vmull.u8 q11, d7, d3
1607 .macro pixman_composite_in_n_8_process_pixblock_tail
1608 vrshr.u16 q14, q8, #8
1609 vrshr.u16 q15, q9, #8
1610 vrshr.u16 q12, q10, #8
1611 vrshr.u16 q13, q11, #8
1612 vraddhn.u16 d28, q8, q14
1613 vraddhn.u16 d29, q9, q15
1614 vraddhn.u16 d30, q10, q12
1615 vraddhn.u16 d31, q11, q13
1618 .macro pixman_composite_in_n_8_process_pixblock_tail_head
1619 pixman_composite_in_n_8_process_pixblock_tail
1620 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1621 cache_preload 32, 32
1622 pixman_composite_in_n_8_process_pixblock_head
1623 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* init: load the solid source (alpha in top byte) into d3 */
1626 .macro pixman_composite_in_n_8_init
1627 add DUMMY, sp, #ARGS_STACK_OFFSET
1628 vld1.32 {d3[0]}, [DUMMY]
1632 .macro pixman_composite_in_n_8_cleanup
1635 generate_composite_function \
1636 pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
1637 FLAG_DST_READWRITE, \
1638 32, /* number of pixels, processed in a single block */ \
1639 5, /* prefetch distance */ \
1640 pixman_composite_in_n_8_init, \
1641 pixman_composite_in_n_8_cleanup, \
1642 pixman_composite_in_n_8_process_pixblock_head, \
1643 pixman_composite_in_n_8_process_pixblock_tail, \
1644 pixman_composite_in_n_8_process_pixblock_tail_head, \
1645 28, /* dst_w_basereg */ \
1646 4, /* dst_r_basereg */ \
1647 0, /* src_basereg */ \
1648 24 /* mask_basereg */
/*
 * add_n_8_8: ADD operator with a solid source through an a8 mask onto
 * an a8 destination — dst' = dst + mask * src.alpha (saturating).
 * head computes mask * alpha with the rounded x/255 idiom, then
 * vqadd folds the destination in; tail is empty because head already
 * produced the final pixels in q14/q15.
 */
1650 .macro pixman_composite_add_n_8_8_process_pixblock_head
1651 /* expecting source data in {d8, d9, d10, d11} */
1652 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
1653 /* and destination data in {d4, d5, d6, d7} */
1654 /* mask is in d24, d25, d26, d27 */
1655 vmull.u8 q0, d24, d11
1656 vmull.u8 q1, d25, d11
1657 vmull.u8 q6, d26, d11
1658 vmull.u8 q7, d27, d11
1659 vrshr.u16 q10, q0, #8
1660 vrshr.u16 q11, q1, #8
1661 vrshr.u16 q12, q6, #8
1662 vrshr.u16 q13, q7, #8
1663 vraddhn.u16 d0, q0, q10
1664 vraddhn.u16 d1, q1, q11
1665 vraddhn.u16 d2, q6, q12
1666 vraddhn.u16 d3, q7, q13
/* saturating add of the destination (q2/q3 = d4-d7) */
1667 vqadd.u8 q14, q0, q2
1668 vqadd.u8 q15, q1, q3
1671 .macro pixman_composite_add_n_8_8_process_pixblock_tail
1674 /* TODO: expand macros and do better instructions scheduling */
1675 .macro pixman_composite_add_n_8_8_process_pixblock_tail_head
1676 pixman_composite_add_n_8_8_process_pixblock_tail
1677 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1678 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1680 cache_preload 32, 32
1681 pixman_composite_add_n_8_8_process_pixblock_head
/* init: load the solid source color from the stack into d11 */
1684 .macro pixman_composite_add_n_8_8_init
1685 add DUMMY, sp, #ARGS_STACK_OFFSET
1687 vld1.32 {d11[0]}, [DUMMY]
1691 .macro pixman_composite_add_n_8_8_cleanup
1695 generate_composite_function \
1696 pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
1697 FLAG_DST_READWRITE, \
1698 32, /* number of pixels, processed in a single block */ \
1699 5, /* prefetch distance */ \
1700 pixman_composite_add_n_8_8_init, \
1701 pixman_composite_add_n_8_8_cleanup, \
1702 pixman_composite_add_n_8_8_process_pixblock_head, \
1703 pixman_composite_add_n_8_8_process_pixblock_tail, \
1704 pixman_composite_add_n_8_8_process_pixblock_tail_head
1706 /******************************************************************************/
/*
 * add_8_8_8: ADD operator, a8 source through a8 mask onto a8
 * destination — dst' = dst + src * mask (saturating).  Same shape as
 * add_n_8_8 but with a per-pixel source; tail is empty, head already
 * leaves the final pixels in q14/q15.
 */
1708 .macro pixman_composite_add_8_8_8_process_pixblock_head
1709 /* expecting source data in {d0, d1, d2, d3} */
1710 /* destination data in {d4, d5, d6, d7} */
1711 /* mask in {d24, d25, d26, d27} */
1712 vmull.u8 q8, d24, d0
1713 vmull.u8 q9, d25, d1
1714 vmull.u8 q10, d26, d2
1715 vmull.u8 q11, d27, d3
/* rounded x/255 reduction of src*mask */
1716 vrshr.u16 q0, q8, #8
1717 vrshr.u16 q1, q9, #8
1718 vrshr.u16 q12, q10, #8
1719 vrshr.u16 q13, q11, #8
1720 vraddhn.u16 d0, q0, q8
1721 vraddhn.u16 d1, q1, q9
1722 vraddhn.u16 d2, q12, q10
1723 vraddhn.u16 d3, q13, q11
/* saturating add of the destination */
1724 vqadd.u8 q14, q0, q2
1725 vqadd.u8 q15, q1, q3
1728 .macro pixman_composite_add_8_8_8_process_pixblock_tail
1731 /* TODO: expand macros and do better instructions scheduling */
1732 .macro pixman_composite_add_8_8_8_process_pixblock_tail_head
1733 pixman_composite_add_8_8_8_process_pixblock_tail
1734 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1735 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1738 cache_preload 32, 32
1739 pixman_composite_add_8_8_8_process_pixblock_head
1742 .macro pixman_composite_add_8_8_8_init
1745 .macro pixman_composite_add_8_8_8_cleanup
1748 generate_composite_function \
1749 pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
1750 FLAG_DST_READWRITE, \
1751 32, /* number of pixels, processed in a single block */ \
1752 5, /* prefetch distance */ \
1753 pixman_composite_add_8_8_8_init, \
1754 pixman_composite_add_8_8_8_cleanup, \
1755 pixman_composite_add_8_8_8_process_pixblock_head, \
1756 pixman_composite_add_8_8_8_process_pixblock_tail, \
1757 pixman_composite_add_8_8_8_process_pixblock_tail_head
1759 /******************************************************************************/
/*
 * add_8888_8888_8888: ADD operator, a8r8g8b8 source through an
 * a8r8g8b8 mask (only the alpha channel d27 is used) onto an a8r8g8b8
 * destination — dst' = dst + src * mask.alpha (saturating).
 *
 * Here the x/255 approximation uses a different instruction pairing
 * than the other fast paths: head does vrsra (t += (t+128)>>8, kept in
 * 16 bits), tail does vrshrn ((t+128)>>8 with narrowing) — same math,
 * split to balance the pipeline ("bubble" comments mark idle slots).
 */
1761 .macro pixman_composite_add_8888_8888_8888_process_pixblock_head
1762 /* expecting source data in {d0, d1, d2, d3} */
1763 /* destination data in {d4, d5, d6, d7} */
1764 /* mask in {d24, d25, d26, d27} */
1765 vmull.u8 q8, d27, d0
1766 vmull.u8 q9, d27, d1
1767 vmull.u8 q10, d27, d2
1768 vmull.u8 q11, d27, d3
1769 /* 1 cycle bubble */
1770 vrsra.u16 q8, q8, #8
1771 vrsra.u16 q9, q9, #8
1772 vrsra.u16 q10, q10, #8
1773 vrsra.u16 q11, q11, #8
1776 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
1777 /* 2 cycle bubble */
1778 vrshrn.u16 d28, q8, #8
1779 vrshrn.u16 d29, q9, #8
1780 vrshrn.u16 d30, q10, #8
1781 vrshrn.u16 d31, q11, #8
1782 vqadd.u8 q14, q2, q14
1783 /* 1 cycle bubble */
1784 vqadd.u8 q15, q3, q15
/* tail_head: fully interleaved tail + head + dst load/store */
1787 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
1789 vrshrn.u16 d28, q8, #8
1791 vrshrn.u16 d29, q9, #8
1792 vmull.u8 q8, d27, d0
1793 vrshrn.u16 d30, q10, #8
1794 vmull.u8 q9, d27, d1
1795 vrshrn.u16 d31, q11, #8
1796 vmull.u8 q10, d27, d2
1797 vqadd.u8 q14, q2, q14
1798 vmull.u8 q11, d27, d3
1799 vqadd.u8 q15, q3, q15
1800 vrsra.u16 q8, q8, #8
1801 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1802 vrsra.u16 q9, q9, #8
1803 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1804 vrsra.u16 q10, q10, #8
1808 vrsra.u16 q11, q11, #8
1811 generate_composite_function \
1812 pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
1813 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1814 8, /* number of pixels, processed in a single block */ \
1815 10, /* prefetch distance */ \
1818 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1819 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1820 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
/* single-scanline variant reusing the same pixblock macros */
1822 generate_composite_function_single_scanline \
1823 pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
1824 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1825 8, /* number of pixels, processed in a single block */ \
1828 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1829 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1830 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
1832 /******************************************************************************/
/*
 * add_8888_8_8888: ADD with an a8 mask, reusing the 8888_8888_8888
 * pixblock macros — the mask byte lands in d27 (mask_basereg = 27),
 * exactly the register those macros multiply by.
 */
1834 generate_composite_function \
1835 pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
1836 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1837 8, /* number of pixels, processed in a single block */ \
1838 5, /* prefetch distance */ \
1841 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1842 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1843 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
1844 28, /* dst_w_basereg */ \
1845 4, /* dst_r_basereg */ \
1846 0, /* src_basereg */ \
1847 27 /* mask_basereg */
1849 /******************************************************************************/
/*
 * add_n_8_8888: ADD with a solid source and an a8 mask onto an
 * a8r8g8b8 destination.  init loads the solid color into d0-d3's
 * slot (d3[0]); the shared add_8888_8888_8888 pixblock macros then
 * treat it as the source (further replication steps are not visible
 * in this excerpt — TODO confirm against the full file).
 */
1851 .macro pixman_composite_add_n_8_8888_init
1852 add DUMMY, sp, #ARGS_STACK_OFFSET
1853 vld1.32 {d3[0]}, [DUMMY]
1860 .macro pixman_composite_add_n_8_8888_cleanup
1863 generate_composite_function \
1864 pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
1865 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1866 8, /* number of pixels, processed in a single block */ \
1867 5, /* prefetch distance */ \
1868 pixman_composite_add_n_8_8888_init, \
1869 pixman_composite_add_n_8_8888_cleanup, \
1870 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1871 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1872 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
1873 28, /* dst_w_basereg */ \
1874 4, /* dst_r_basereg */ \
1875 0, /* src_basereg */ \
1876 27 /* mask_basereg */
1878 /******************************************************************************/
/*
 * add_8888_n_8888: ADD with an a8r8g8b8 source and a solid mask.
 * init loads the solid mask's alpha into d27 (note the +8 stack
 * offset: the mask is the second stack argument), the register the
 * shared add_8888_8888_8888 macros multiply by.
 */
1880 .macro pixman_composite_add_8888_n_8888_init
1881 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
1882 vld1.32 {d27[0]}, [DUMMY]
1886 .macro pixman_composite_add_8888_n_8888_cleanup
1889 generate_composite_function \
1890 pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
1891 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1892 8, /* number of pixels, processed in a single block */ \
1893 5, /* prefetch distance */ \
1894 pixman_composite_add_8888_n_8888_init, \
1895 pixman_composite_add_8888_n_8888_cleanup, \
1896 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1897 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1898 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
1899 28, /* dst_w_basereg */ \
1900 4, /* dst_r_basereg */ \
1901 0, /* src_basereg */ \
1902 27 /* mask_basereg */
1904 /******************************************************************************/
/*
 * out_reverse_8888_n_8888: OUT_reverse with a solid mask (alpha
 * replicated in d15) — head computes src * mask (rounded x/255 idiom),
 * inverts the resulting alpha, and starts dst * (1 - alpha); tail
 * reduces those products into d28-d31.  No source term is added back:
 * that is what distinguishes OUT_reverse from OVER (the over_8888_n_8888
 * macros below reuse these and append the vqadd).
 */
1906 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
1907 /* expecting source data in {d0, d1, d2, d3} */
1908 /* destination data in {d4, d5, d6, d7} */
1909 /* solid mask is in d15 */
1912 vmull.u8 q8, d15, d3
1913 vmull.u8 q6, d15, d2
1914 vmull.u8 q5, d15, d1
1915 vmull.u8 q4, d15, d0
1916 vrshr.u16 q13, q8, #8
1917 vrshr.u16 q12, q6, #8
1918 vrshr.u16 q11, q5, #8
1919 vrshr.u16 q10, q4, #8
1920 vraddhn.u16 d3, q8, q13
1921 vraddhn.u16 d2, q6, q12
1922 vraddhn.u16 d1, q5, q11
1923 vraddhn.u16 d0, q4, q10
1924 vmvn.8 d24, d3 /* get inverted alpha */
1925 /* now do alpha blending */
1926 vmull.u8 q8, d24, d4
1927 vmull.u8 q9, d24, d5
1928 vmull.u8 q10, d24, d6
1929 vmull.u8 q11, d24, d7
1932 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
1933 vrshr.u16 q14, q8, #8
1934 vrshr.u16 q15, q9, #8
1935 vrshr.u16 q12, q10, #8
1936 vrshr.u16 q13, q11, #8
1937 vraddhn.u16 d28, q14, q8
1938 vraddhn.u16 d29, q15, q9
1939 vraddhn.u16 d30, q12, q10
1940 vraddhn.u16 d31, q13, q11
1943 /* TODO: expand macros and do better instructions scheduling */
1944 .macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
1945 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1946 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
1950 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
1951 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1954 generate_composite_function_single_scanline \
1955 pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
1956 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1957 8, /* number of pixels, processed in a single block */ \
1958 default_init_need_all_regs, \
1959 default_cleanup_need_all_regs, \
1960 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
1961 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
1962 pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \
1963 28, /* dst_w_basereg */ \
1964 4, /* dst_r_basereg */ \
1965 0, /* src_basereg */ \
1966 12 /* mask_basereg */
1968 /******************************************************************************/
/*
 * over_8888_n_8888: OVER with an a8r8g8b8 source modulated by a solid
 * mask.  Built on the out_reverse macros above; the only addition is
 * the saturating add of the masked source (q0/q1) in the tail, which
 * turns OUT_reverse into OVER.
 */
1970 .macro pixman_composite_over_8888_n_8888_process_pixblock_head
1971 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
1974 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail
1975 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
1976 vqadd.u8 q14, q0, q14
1977 vqadd.u8 q15, q1, q15
1980 /* TODO: expand macros and do better instructions scheduling */
1981 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
1982 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1983 pixman_composite_over_8888_n_8888_process_pixblock_tail
1986 pixman_composite_over_8888_n_8888_process_pixblock_head
1987 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* init: load the solid mask alpha into d15 (the register the shared
 * head multiplies by); the DUMMY address setup line is not visible in
 * this excerpt */
1990 .macro pixman_composite_over_8888_n_8888_init
1993 vld1.32 {d15[0]}, [DUMMY]
1997 .macro pixman_composite_over_8888_n_8888_cleanup
2001 generate_composite_function \
2002 pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
2003 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2004 8, /* number of pixels, processed in a single block */ \
2005 5, /* prefetch distance */ \
2006 pixman_composite_over_8888_n_8888_init, \
2007 pixman_composite_over_8888_n_8888_cleanup, \
2008 pixman_composite_over_8888_n_8888_process_pixblock_head, \
2009 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2010 pixman_composite_over_8888_n_8888_process_pixblock_tail_head
2012 /******************************************************************************/
/*
 * over_8888_8888_8888: OVER with a per-pixel a8r8g8b8 mask.  Reuses
 * the over_8888_n_8888 head/tail; mask_basereg = 12 puts the loaded
 * mask alpha where those macros expect the "solid" mask (d15 is
 * within the d12-based mask quad).  A scanline variant is generated
 * from the same macros below.
 */
2014 /* TODO: expand macros and do better instructions scheduling */
2015 .macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
2016 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
2017 pixman_composite_over_8888_n_8888_process_pixblock_tail
2021 pixman_composite_over_8888_n_8888_process_pixblock_head
2022 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2025 generate_composite_function \
2026 pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
2027 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2028 8, /* number of pixels, processed in a single block */ \
2029 5, /* prefetch distance */ \
2030 default_init_need_all_regs, \
2031 default_cleanup_need_all_regs, \
2032 pixman_composite_over_8888_n_8888_process_pixblock_head, \
2033 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2034 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
2035 28, /* dst_w_basereg */ \
2036 4, /* dst_r_basereg */ \
2037 0, /* src_basereg */ \
2038 12 /* mask_basereg */
2040 generate_composite_function_single_scanline \
2041 pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
2042 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2043 8, /* number of pixels, processed in a single block */ \
2044 default_init_need_all_regs, \
2045 default_cleanup_need_all_regs, \
2046 pixman_composite_over_8888_n_8888_process_pixblock_head, \
2047 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2048 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
2049 28, /* dst_w_basereg */ \
2050 4, /* dst_r_basereg */ \
2051 0, /* src_basereg */ \
2052 12 /* mask_basereg */
2054 /******************************************************************************/
/*
 * over_8888_8_8888: OVER with an a8 mask, again reusing the
 * over_8888_n_8888 head/tail; mask_basereg = 15 makes the a8 mask
 * load land directly in d15, the register those macros multiply by.
 */
2056 /* TODO: expand macros and do better instructions scheduling */
2057 .macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
2058 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
2059 pixman_composite_over_8888_n_8888_process_pixblock_tail
2063 pixman_composite_over_8888_n_8888_process_pixblock_head
2064 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2067 generate_composite_function \
2068 pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
2069 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2070 8, /* number of pixels, processed in a single block */ \
2071 5, /* prefetch distance */ \
2072 default_init_need_all_regs, \
2073 default_cleanup_need_all_regs, \
2074 pixman_composite_over_8888_n_8888_process_pixblock_head, \
2075 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2076 pixman_composite_over_8888_8_8888_process_pixblock_tail_head \
2077 28, /* dst_w_basereg */ \
2078 4, /* dst_r_basereg */ \
2079 0, /* src_basereg */ \
2080 15 /* mask_basereg */
2082 /******************************************************************************/
/*
 * SRC: straight 24bpp (b8g8r8) copy.  No per-pixel math is needed, so
 * the head/tail macros are empty and tail_head only stores the block
 * that the framework loaded; loads are generated by the template.
 * NOTE(review): .endm lines for these macros appear elided in this copy.
 */
2084 .macro pixman_composite_src_0888_0888_process_pixblock_head
2087 .macro pixman_composite_src_0888_0888_process_pixblock_tail
2090 .macro pixman_composite_src_0888_0888_process_pixblock_tail_head
2091 vst3.8 {d0, d1, d2}, [DST_W]!
/* 24bpp src, no mask, 24bpp dst; write-only destination */
2096 generate_composite_function \
2097 pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
2098 FLAG_DST_WRITEONLY, \
2099 8, /* number of pixels, processed in a single block */ \
2100 10, /* prefetch distance */ \
2103 pixman_composite_src_0888_0888_process_pixblock_head, \
2104 pixman_composite_src_0888_0888_process_pixblock_tail, \
2105 pixman_composite_src_0888_0888_process_pixblock_tail_head, \
2106 0, /* dst_w_basereg */ \
2107 0, /* dst_r_basereg */ \
2108 0, /* src_basereg */ \
2109 0 /* mask_basereg */
2111 /******************************************************************************/
/*
 * SRC: 24bpp source expanded to 32bpp destination with reversed
 * component order (per the _rev suffix).  Head/tail bodies and the
 * custom init body are elided in this copy; only the store survives.
 * NOTE(review): numbering gaps (2114-2116, 2122-2126, 2128-2130) mark
 * dropped lines — consult upstream before editing.
 */
2113 .macro pixman_composite_src_0888_8888_rev_process_pixblock_head
2117 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
2120 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
2121 vst4.8 {d0, d1, d2, d3}, [DST_W]!
2127 .macro pixman_composite_src_0888_8888_rev_init
2131 generate_composite_function \
2132 pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
2133 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2134 8, /* number of pixels, processed in a single block */ \
2135 10, /* prefetch distance */ \
2136 pixman_composite_src_0888_8888_rev_init, \
2138 pixman_composite_src_0888_8888_rev_process_pixblock_head, \
2139 pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
2140 pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
2141 0, /* dst_w_basereg */ \
2142 0, /* dst_r_basereg */ \
2143 0, /* src_basereg */ \
2144 0 /* mask_basereg */
2146 /******************************************************************************/
/*
 * SRC: 24bpp source packed down to r5g6b5, reversed component order.
 * Packing pattern: widen one 8-bit channel into the high byte of each
 * 16-bit lane (vshll #8), then shift-insert the other two channels at
 * bit offsets 5 and 11 (vsri) to form the 565 value in q14.
 * NOTE(review): head-macro body and parts of tail_head are elided here.
 */
2148 .macro pixman_composite_src_0888_0565_rev_process_pixblock_head
2153 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
2154 vshll.u8 q14, d0, #8
2155 vsri.u16 q14, q8, #5
2156 vsri.u16 q14, q9, #11
2159 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
2160 vshll.u8 q14, d0, #8
2162 vsri.u16 q14, q8, #5
2163 vsri.u16 q14, q9, #11
2165 vst1.16 {d28, d29}, [DST_W, :128]!
/* 24bpp src, no mask, 16bpp dst; results staged in q14 (basereg 28) */
2169 generate_composite_function \
2170 pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
2171 FLAG_DST_WRITEONLY, \
2172 8, /* number of pixels, processed in a single block */ \
2173 10, /* prefetch distance */ \
2176 pixman_composite_src_0888_0565_rev_process_pixblock_head, \
2177 pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
2178 pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
2179 28, /* dst_w_basereg */ \
2180 0, /* dst_r_basereg */ \
2181 0, /* src_basereg */ \
2182 0 /* mask_basereg */
2184 /******************************************************************************/
/*
 * SRC from a "pixbuf": multiplies color channels by alpha (d3) and
 * rounds with the usual x*a -> (t + (t >> 8) + 0x80) >> 8 pattern
 * implemented as vrshr + vraddhn pairs.  Result channels land in
 * d28..d30 with alpha in d31 (note d30/d28 swapped vs the rpixbuf
 * variant below — that swap is the only difference between the two).
 * NOTE(review): parts of the head and tail_head (incl. the other two
 * vmull lines and PF branch labels) are elided in this copy.
 */
2186 .macro pixman_composite_src_pixbuf_8888_process_pixblock_head
2189 vmull.u8 q10, d3, d2
2192 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
2193 vrshr.u16 q11, q8, #8
2195 vrshr.u16 q12, q9, #8
2196 vrshr.u16 q13, q10, #8
2197 vraddhn.u16 d30, q11, q8
2198 vraddhn.u16 d29, q12, q9
2199 vraddhn.u16 d28, q13, q10
/* pipelined tail+head with interleaved prefetcher (PF) bookkeeping */
2202 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
2203 vrshr.u16 q11, q8, #8
2205 vrshr.u16 q12, q9, #8
2206 vrshr.u16 q13, q10, #8
2208 vraddhn.u16 d30, q11, q8
2209 PF add PF_X, PF_X, #8
2211 PF addne PF_X, PF_X, #8
2212 PF subne PF_CTL, PF_CTL, #1
2213 vraddhn.u16 d29, q12, q9
2214 vraddhn.u16 d28, q13, q10
2217 vmull.u8 q10, d3, d2
2218 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2220 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
2221 PF subge PF_X, PF_X, ORIG_W
2222 PF subges PF_CTL, PF_CTL, #0x10
2223 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
2226 generate_composite_function \
2227 pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
2228 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2229 8, /* number of pixels, processed in a single block */ \
2230 10, /* prefetch distance */ \
2233 pixman_composite_src_pixbuf_8888_process_pixblock_head, \
2234 pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
2235 pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
2236 28, /* dst_w_basereg */ \
2237 0, /* dst_r_basereg */ \
2238 0, /* src_basereg */ \
2239 0 /* mask_basereg */
2241 /******************************************************************************/
/*
 * SRC from an "rpixbuf": identical alpha-premultiply pipeline to the
 * pixbuf variant above, except the red/blue results go to d28/d30 in
 * the opposite order (d28, d29, d30 here vs d30, d29, d28 there),
 * i.e. the channel order is swapped.
 * NOTE(review): same elisions as the pixbuf variant (head body, PF
 * branch labels) — numbering gaps 2246->2249, 2275->2277.
 */
2243 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
2246 vmull.u8 q10, d3, d2
2249 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
2250 vrshr.u16 q11, q8, #8
2252 vrshr.u16 q12, q9, #8
2253 vrshr.u16 q13, q10, #8
2254 vraddhn.u16 d28, q11, q8
2255 vraddhn.u16 d29, q12, q9
2256 vraddhn.u16 d30, q13, q10
/* pipelined tail+head with interleaved prefetcher (PF) bookkeeping */
2259 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
2260 vrshr.u16 q11, q8, #8
2262 vrshr.u16 q12, q9, #8
2263 vrshr.u16 q13, q10, #8
2265 vraddhn.u16 d28, q11, q8
2266 PF add PF_X, PF_X, #8
2268 PF addne PF_X, PF_X, #8
2269 PF subne PF_CTL, PF_CTL, #1
2270 vraddhn.u16 d29, q12, q9
2271 vraddhn.u16 d30, q13, q10
2274 vmull.u8 q10, d3, d2
2275 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2277 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
2278 PF subge PF_X, PF_X, ORIG_W
2279 PF subges PF_CTL, PF_CTL, #0x10
2280 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
2283 generate_composite_function \
2284 pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
2285 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2286 8, /* number of pixels, processed in a single block */ \
2287 10, /* prefetch distance */ \
2290 pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
2291 pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
2292 pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
2293 28, /* dst_w_basereg */ \
2294 0, /* dst_r_basereg */ \
2295 0, /* src_basereg */ \
2296 0 /* mask_basereg */
2298 /******************************************************************************/
/*
 * OVER: r5g6b5 source, a8 mask, r5g6b5 destination.
 * Head: unpack both src (q4 -> d0..d2) and dst (q5 -> d4..d6) from 565
 * to 888, multiply src channels by the mask (d15), and round
 * (vrshr + vraddhn) back into d0..d2.  Tail: blend with the
 * inverse-alpha-weighted destination accumulated in q8/q9/q13,
 * saturating-add, and repack to 565.
 * NOTE(review): some head lines are elided (gap 2309->2312), so the
 * full inverse-alpha computation is not visible here.
 */
2300 .macro pixman_composite_over_0565_8_0565_process_pixblock_head
2301 /* mask is in d15 */
2302 convert_0565_to_x888 q4, d2, d1, d0
2303 convert_0565_to_x888 q5, d6, d5, d4
2304 /* source pixel data is in {d0, d1, d2, XX} */
2305 /* destination pixel data is in {d4, d5, d6, XX} */
2307 vmull.u8 q6, d15, d2
2308 vmull.u8 q5, d15, d1
2309 vmull.u8 q4, d15, d0
2312 vmull.u8 q13, d7, d6
2313 vrshr.u16 q12, q6, #8
2314 vrshr.u16 q11, q5, #8
2315 vrshr.u16 q10, q4, #8
2316 vraddhn.u16 d2, q6, q12
2317 vraddhn.u16 d1, q5, q11
2318 vraddhn.u16 d0, q4, q10
2321 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail
2322 vrshr.u16 q14, q8, #8
2323 vrshr.u16 q15, q9, #8
2324 vrshr.u16 q12, q13, #8
2325 vraddhn.u16 d28, q14, q8
2326 vraddhn.u16 d29, q15, q9
2327 vraddhn.u16 d30, q12, q13
2328 vqadd.u8 q0, q0, q14
2329 vqadd.u8 q1, q1, q15
2330 /* 32bpp result is in {d0, d1, d2, XX} */
2331 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2334 /* TODO: expand macros and do better instructions scheduling */
2335 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
2337 pixman_composite_over_0565_8_0565_process_pixblock_tail
2339 vld1.16 {d10, d11}, [DST_R, :128]!
2341 pixman_composite_over_0565_8_0565_process_pixblock_head
2342 vst1.16 {d28, d29}, [DST_W, :128]!
/* 16bpp src, 8bpp mask, 16bpp dst; mask in d15, dst staged in q5 */
2345 generate_composite_function \
2346 pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
2347 FLAG_DST_READWRITE, \
2348 8, /* number of pixels, processed in a single block */ \
2349 5, /* prefetch distance */ \
2350 default_init_need_all_regs, \
2351 default_cleanup_need_all_regs, \
2352 pixman_composite_over_0565_8_0565_process_pixblock_head, \
2353 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2354 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2355 28, /* dst_w_basereg */ \
2356 10, /* dst_r_basereg */ \
2357 8, /* src_basereg */ \
2358 15 /* mask_basereg */
2360 /******************************************************************************/
/*
 * OVER with a solid (n = constant) mask over r5g6b5: the init macro
 * loads the constant mask from the stack into d15 — the same register
 * a per-pixel mask would occupy — so the over_0565_8_0565 pixblock
 * macros can be reused unchanged.  Cleanup body is elided in this copy.
 */
2362 .macro pixman_composite_over_0565_n_0565_init
2363 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
2365 vld1.32 {d15[0]}, [DUMMY]
2369 .macro pixman_composite_over_0565_n_0565_cleanup
2373 generate_composite_function \
2374 pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
2375 FLAG_DST_READWRITE, \
2376 8, /* number of pixels, processed in a single block */ \
2377 5, /* prefetch distance */ \
2378 pixman_composite_over_0565_n_0565_init, \
2379 pixman_composite_over_0565_n_0565_cleanup, \
2380 pixman_composite_over_0565_8_0565_process_pixblock_head, \
2381 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2382 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2383 28, /* dst_w_basereg */ \
2384 10, /* dst_r_basereg */ \
2385 8, /* src_basereg */ \
2386 15 /* mask_basereg */
2388 /******************************************************************************/
/*
 * ADD: r5g6b5 source, a8 mask, r5g6b5 destination.
 * Head: unpack src and dst from 565 to 888, multiply src channels by
 * the mask (d15) with rounding (vrshr + vraddhn) into d0..d2.
 * Tail: repack to 565.  The saturating add of masked src and dst is
 * elided in this copy (gap 2404->2407 / 2407->2410).
 */
2390 .macro pixman_composite_add_0565_8_0565_process_pixblock_head
2391 /* mask is in d15 */
2392 convert_0565_to_x888 q4, d2, d1, d0
2393 convert_0565_to_x888 q5, d6, d5, d4
2394 /* source pixel data is in {d0, d1, d2, XX} */
2395 /* destination pixel data is in {d4, d5, d6, XX} */
2396 vmull.u8 q6, d15, d2
2397 vmull.u8 q5, d15, d1
2398 vmull.u8 q4, d15, d0
2399 vrshr.u16 q12, q6, #8
2400 vrshr.u16 q11, q5, #8
2401 vrshr.u16 q10, q4, #8
2402 vraddhn.u16 d2, q6, q12
2403 vraddhn.u16 d1, q5, q11
2404 vraddhn.u16 d0, q4, q10
2407 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail
2410 /* 32bpp result is in {d0, d1, d2, XX} */
2411 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2414 /* TODO: expand macros and do better instructions scheduling */
2415 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
2417 pixman_composite_add_0565_8_0565_process_pixblock_tail
2419 vld1.16 {d10, d11}, [DST_R, :128]!
2421 pixman_composite_add_0565_8_0565_process_pixblock_head
2422 vst1.16 {d28, d29}, [DST_W, :128]!
/* 16bpp src, 8bpp mask, 16bpp dst; same register layout as over_0565_8_0565 */
2425 generate_composite_function \
2426 pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
2427 FLAG_DST_READWRITE, \
2428 8, /* number of pixels, processed in a single block */ \
2429 5, /* prefetch distance */ \
2430 default_init_need_all_regs, \
2431 default_cleanup_need_all_regs, \
2432 pixman_composite_add_0565_8_0565_process_pixblock_head, \
2433 pixman_composite_add_0565_8_0565_process_pixblock_tail, \
2434 pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
2435 28, /* dst_w_basereg */ \
2436 10, /* dst_r_basereg */ \
2437 8, /* src_basereg */ \
2438 15 /* mask_basereg */
2440 /******************************************************************************/
/*
 * OUT_REVERSE: a8 source (used as alpha in d15), r5g6b5 destination:
 * dst = dst * (255 - alpha).  Head: unpack dst from 565, invert the
 * alpha (vmvn), and multiply each channel by it.  Tail: round the
 * q8/q9/q10 products (vrshr + vraddhn) and repack to 565.
 */
2442 .macro pixman_composite_out_reverse_8_0565_process_pixblock_head
2443 /* mask is in d15 */
2444 convert_0565_to_x888 q5, d6, d5, d4
2445 /* destination pixel data is in {d4, d5, d6, xx} */
2446 vmvn.8 d24, d15 /* get inverted alpha */
2447 /* now do alpha blending */
2448 vmull.u8 q8, d24, d4
2449 vmull.u8 q9, d24, d5
2450 vmull.u8 q10, d24, d6
2453 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
2454 vrshr.u16 q14, q8, #8
2455 vrshr.u16 q15, q9, #8
2456 vrshr.u16 q12, q10, #8
2457 vraddhn.u16 d0, q14, q8
2458 vraddhn.u16 d1, q15, q9
2459 vraddhn.u16 d2, q12, q10
2460 /* 32bpp result is in {d0, d1, d2, XX} */
2461 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2464 /* TODO: expand macros and do better instructions scheduling */
2465 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
2467 pixman_composite_out_reverse_8_0565_process_pixblock_tail
2468 vld1.16 {d10, d11}, [DST_R, :128]!
2470 pixman_composite_out_reverse_8_0565_process_pixblock_head
2471 vst1.16 {d28, d29}, [DST_W, :128]!
/* 8bpp src (the alpha), no mask, 16bpp dst */
2474 generate_composite_function \
2475 pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
2476 FLAG_DST_READWRITE, \
2477 8, /* number of pixels, processed in a single block */ \
2478 5, /* prefetch distance */ \
2479 default_init_need_all_regs, \
2480 default_cleanup_need_all_regs, \
2481 pixman_composite_out_reverse_8_0565_process_pixblock_head, \
2482 pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
2483 pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
2484 28, /* dst_w_basereg */ \
2485 10, /* dst_r_basereg */ \
2486 15, /* src_basereg */ \
2487 0 /* mask_basereg */
2489 /******************************************************************************/
/*
 * Nearest-neighbour scaled scanline entry points.  Each invocation
 * reuses the unscaled pixblock macros of the matching operation; only
 * the pixel fetching differs (handled by the nearest-scanline template).
 * NOTE(review): some invocations are missing their init/cleanup
 * argument lines in this copy (numbering gaps, e.g. 2494->2497).
 */
/* OVER a8r8g8b8 -> a8r8g8b8 */
2491 generate_composite_function_nearest_scanline \
2492 pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
2493 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2494 8, /* number of pixels, processed in a single block */ \
2497 pixman_composite_over_8888_8888_process_pixblock_head, \
2498 pixman_composite_over_8888_8888_process_pixblock_tail, \
2499 pixman_composite_over_8888_8888_process_pixblock_tail_head
/* OVER a8r8g8b8 -> r5g6b5 */
2501 generate_composite_function_nearest_scanline \
2502 pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
2503 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2504 8, /* number of pixels, processed in a single block */ \
2507 pixman_composite_over_8888_0565_process_pixblock_head, \
2508 pixman_composite_over_8888_0565_process_pixblock_tail, \
2509 pixman_composite_over_8888_0565_process_pixblock_tail_head, \
2510 28, /* dst_w_basereg */ \
2511 4, /* dst_r_basereg */ \
2512 0, /* src_basereg */ \
2513 24 /* mask_basereg */
/* SRC a8r8g8b8 -> r5g6b5 */
2515 generate_composite_function_nearest_scanline \
2516 pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
2517 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2518 8, /* number of pixels, processed in a single block */ \
2521 pixman_composite_src_8888_0565_process_pixblock_head, \
2522 pixman_composite_src_8888_0565_process_pixblock_tail, \
2523 pixman_composite_src_8888_0565_process_pixblock_tail_head
/* SRC r5g6b5 -> a8r8g8b8 */
2525 generate_composite_function_nearest_scanline \
2526 pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
2527 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2528 8, /* number of pixels, processed in a single block */ \
2531 pixman_composite_src_0565_8888_process_pixblock_head, \
2532 pixman_composite_src_0565_8888_process_pixblock_tail, \
2533 pixman_composite_src_0565_8888_process_pixblock_tail_head
/* OVER a8r8g8b8 with a8 mask -> r5g6b5 */
2535 generate_composite_function_nearest_scanline \
2536 pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
2537 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2538 8, /* number of pixels, processed in a single block */ \
2539 default_init_need_all_regs, \
2540 default_cleanup_need_all_regs, \
2541 pixman_composite_over_8888_8_0565_process_pixblock_head, \
2542 pixman_composite_over_8888_8_0565_process_pixblock_tail, \
2543 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
2544 28, /* dst_w_basereg */ \
2545 4, /* dst_r_basereg */ \
2546 8, /* src_basereg */ \
2547 24 /* mask_basereg */
/* OVER r5g6b5 with a8 mask -> r5g6b5 */
2549 generate_composite_function_nearest_scanline \
2550 pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
2551 FLAG_DST_READWRITE, \
2552 8, /* number of pixels, processed in a single block */ \
2553 default_init_need_all_regs, \
2554 default_cleanup_need_all_regs, \
2555 pixman_composite_over_0565_8_0565_process_pixblock_head, \
2556 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2557 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2558 28, /* dst_w_basereg */ \
2559 10, /* dst_r_basereg */ \
2560 8, /* src_basereg */ \
2561 15 /* mask_basereg */
2563 /******************************************************************************/
2565 /* Supplementary macro for setting function attributes */
/*
 * Declares 'fname' as an ELF function symbol.  NOTE(review): the rest
 * of this macro (.global/.func directives, the label itself, and .endm)
 * is elided in this copy — only the .type line survives; confirm the
 * full definition against upstream before editing.
 */
2566 .macro pixman_asm_function fname
2571 .type fname, %function
2577 * Bilinear scaling support code which tries to provide pixel fetching, color
2578 * format conversion, and interpolation as separate macros which can be used
2579 * as the basic building blocks for constructing bilinear scanline functions.
/*
 * Fetch the two vertically adjacent source pixels for one destination
 * pixel.  X is a 16.16 fixed-point coordinate: the integer part (asr
 * #16) indexes the scanline, scaled by the pixel size (asl #2 for
 * 32bpp, asl #1 for 16bpp).  STRIDE separates the top and bottom rows.
 * NOTE(review): the X-advance line between the mov and add appears
 * elided in this copy (gaps 2583->2585, 2591->2593).
 */
2582 .macro bilinear_load_8888 reg1, reg2, tmp
2583 mov TMP1, X, asr #16
2585 add TMP1, TOP, TMP1, asl #2
2586 vld1.32 {reg1}, [TMP1], STRIDE
2587 vld1.32 {reg2}, [TMP1]
/* 0565 variant: load two 565 pixels into one D register, then unpack */
2590 .macro bilinear_load_0565 reg1, reg2, tmp
2591 mov TMP1, X, asr #16
2593 add TMP1, TOP, TMP1, asl #1
2594 vld1.32 {reg2[0]}, [TMP1], STRIDE
2595 vld1.32 {reg2[1]}, [TMP1]
2596 convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
/*
 * Load top/bottom source pixels and do the vertical interpolation step:
 * acc = top * d28 + bottom * d29, where d28/d29 hold the vertical
 * weights.  The four-pixel 8888 variant is just two two-pixel calls.
 */
2599 .macro bilinear_load_and_vertical_interpolate_two_8888 \
2600 acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
2602 bilinear_load_8888 reg1, reg2, tmp1
2603 vmull.u8 acc1, reg1, d28
2604 vmlal.u8 acc1, reg2, d29
2605 bilinear_load_8888 reg3, reg4, tmp2
2606 vmull.u8 acc2, reg3, d28
2607 vmlal.u8 acc2, reg4, d29
2610 .macro bilinear_load_and_vertical_interpolate_four_8888 \
2611 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
2612 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
2614 bilinear_load_and_vertical_interpolate_two_8888 \
2615 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
2616 bilinear_load_and_vertical_interpolate_two_8888 \
2617 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
/*
 * 0565 two-pixel variant: gather two pixel-pairs from two source
 * columns (TMP1/TMP2), unpack 565 -> x888, then vertically interpolate
 * with weights d28/d29.  NOTE(review): the X-advance lines and the
 * vzip shuffles between unpack and multiply are elided in this copy
 * (gaps 2633->2638); see the four_0565 variant below for the pattern.
 */
2620 .macro bilinear_load_and_vertical_interpolate_two_0565 \
2621 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
2623 mov TMP1, X, asr #16
2625 add TMP1, TOP, TMP1, asl #1
2626 mov TMP2, X, asr #16
2628 add TMP2, TOP, TMP2, asl #1
2629 vld1.32 {acc2lo[0]}, [TMP1], STRIDE
2630 vld1.32 {acc2hi[0]}, [TMP2], STRIDE
2631 vld1.32 {acc2lo[1]}, [TMP1]
2632 vld1.32 {acc2hi[1]}, [TMP2]
2633 convert_0565_to_x888 acc2, reg3, reg2, reg1
2638 vmull.u8 acc1, reg1, d28
2639 vmlal.u8 acc1, reg2, d29
2640 vmull.u8 acc2, reg3, d28
2641 vmlal.u8 acc2, reg4, d29
/*
 * 0565 four-pixel variant: two interleaved column-pair loads (x side,
 * then y side).  The vzip shuffles rearrange the unpacked x888 channel
 * data into per-pixel order, and are deliberately interleaved with the
 * y-side loads to hide load latency.
 */
2644 .macro bilinear_load_and_vertical_interpolate_four_0565 \
2645 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
2646 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
2648 mov TMP1, X, asr #16
2650 add TMP1, TOP, TMP1, asl #1
2651 mov TMP2, X, asr #16
2653 add TMP2, TOP, TMP2, asl #1
2654 vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
2655 vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
2656 vld1.32 {xacc2lo[1]}, [TMP1]
2657 vld1.32 {xacc2hi[1]}, [TMP2]
2658 convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
2659 mov TMP1, X, asr #16
2661 add TMP1, TOP, TMP1, asl #1
2662 mov TMP2, X, asr #16
2664 add TMP2, TOP, TMP2, asl #1
2665 vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
2666 vzip.u8 xreg1, xreg3
2667 vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
2668 vzip.u8 xreg2, xreg4
2669 vld1.32 {yacc2lo[1]}, [TMP1]
2670 vzip.u8 xreg3, xreg4
2671 vld1.32 {yacc2hi[1]}, [TMP2]
2672 vzip.u8 xreg1, xreg2
2673 convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
2674 vmull.u8 xacc1, xreg1, d28
2675 vzip.u8 yreg1, yreg3
2676 vmlal.u8 xacc1, xreg2, d29
2677 vzip.u8 yreg2, yreg4
2678 vmull.u8 xacc2, xreg3, d28
2679 vzip.u8 yreg3, yreg4
2680 vmlal.u8 xacc2, xreg4, d29
2681 vzip.u8 yreg1, yreg2
2682 vmull.u8 yacc1, yreg1, d28
2683 vmlal.u8 yacc1, yreg2, d29
2684 vmull.u8 yacc2, yreg3, d28
2685 vmlal.u8 yacc2, yreg4, d29
/*
 * Store 'numpix' (4, 2 or 1) interpolated pixels from d0/d1 (8888) or
 * pack d0..d2 to 565 in d2 first (0565 variant).  The .if/.elseif
 * selector lines around each store are elided in this copy — only the
 * store for each numpix case and the .error fallback survive.
 */
2688 .macro bilinear_store_8888 numpix, tmp1, tmp2
2690 vst1.32 {d0, d1}, [OUT, :128]!
2692 vst1.32 {d0}, [OUT, :64]!
2694 vst1.32 {d0[0]}, [OUT, :32]!
2696 .error bilinear_store_8888 numpix is unsupported
2700 .macro bilinear_store_0565 numpix, tmp1, tmp2
2705 convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
2707 vst1.16 {d2}, [OUT, :64]!
2709 vst1.32 {d2[0]}, [OUT, :32]!
2711 vst1.16 {d2[0]}, [OUT, :16]!
2713 .error bilinear_store_0565 numpix is unsupported
/*
 * Interpolate and store a single pixel: vertical pass via
 * vmull/vmlal with weights d28/d29, then horizontal pass as
 * q0 = d2*256 - d2*weight + d3*weight (vshll/vmlsl/vmlal with d30),
 * narrowed back by vshrn #16.  The "bubble" comments record NEON
 * result-latency stalls in this unpipelined path.
 */
2717 .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
2718 bilinear_load_&src_fmt d0, d1, d2
2719 vmull.u8 q1, d0, d28
2720 vmlal.u8 q1, d1, d29
2721 /* 5 cycles bubble */
2722 vshll.u16 q0, d2, #8
2723 vmlsl.u16 q0, d2, d30
2724 vmlal.u16 q0, d3, d30
2725 /* 5 cycles bubble */
2726 vshrn.u32 d0, q0, #16
2727 /* 3 cycles bubble */
2729 /* 1 cycle bubble */
2730 bilinear_store_&dst_fmt 1, q2, q3
/*
 * Interpolate and store two pixels.  Horizontal weights are d30/d31
 * (low/high halves of q15 = q12 >> 8); q12 accumulates the per-pixel
 * x fraction and is advanced by q13 at the end for the next pair.
 */
2733 .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
2734 bilinear_load_and_vertical_interpolate_two_&src_fmt \
2735 q1, q11, d0, d1, d20, d21, d22, d23
2736 vshll.u16 q0, d2, #8
2737 vmlsl.u16 q0, d2, d30
2738 vmlal.u16 q0, d3, d30
2739 vshll.u16 q10, d22, #8
2740 vmlsl.u16 q10, d22, d31
2741 vmlal.u16 q10, d23, d31
2742 vshrn.u32 d0, q0, #16
2743 vshrn.u32 d1, q10, #16
2744 vshr.u16 q15, q12, #8
2745 vadd.u16 q12, q12, q13
2747 bilinear_store_&dst_fmt 2, q2, q3
/*
 * Generic (non-specialized) four-pixel interpolation: one four-pixel
 * vertical pass, then four horizontal interpolations (weights d30/d31
 * refreshed from q12 mid-stream), narrow, and store.  Specialized
 * pipelined versions below override this via the have_* dispatch.
 * NOTE(review): a line between 2753 and 2755 is elided in this copy.
 */
2750 .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
2751 bilinear_load_and_vertical_interpolate_four_&src_fmt \
2752 q1, q11, d0, d1, d20, d21, d22, d23 \
2753 q3, q9, d4, d5, d16, d17, d18, d19
2755 sub TMP1, TMP1, STRIDE
2756 vshll.u16 q0, d2, #8
2757 vmlsl.u16 q0, d2, d30
2758 vmlal.u16 q0, d3, d30
2759 vshll.u16 q10, d22, #8
2760 vmlsl.u16 q10, d22, d31
2761 vmlal.u16 q10, d23, d31
2762 vshr.u16 q15, q12, #8
2763 vshll.u16 q2, d6, #8
2764 vmlsl.u16 q2, d6, d30
2765 vmlal.u16 q2, d7, d30
2766 vshll.u16 q8, d18, #8
2768 vmlsl.u16 q8, d18, d31
2769 vmlal.u16 q8, d19, d31
2770 vadd.u16 q12, q12, q13
2771 vshrn.u32 d0, q0, #16
2772 vshrn.u32 d1, q10, #16
2773 vshrn.u32 d4, q2, #16
2774 vshrn.u32 d5, q8, #16
2775 vshr.u16 q15, q12, #8
2778 vadd.u16 q12, q12, q13
2779 bilinear_store_&dst_fmt 4, q2, q3
/*
 * Dispatch layer: if a format pair has a hand-pipelined implementation
 * (signalled by a have_bilinear_interpolate_*_<src>_<dst> symbol), use
 * its _head/_tail/_tail_head parts; otherwise fall back to the generic
 * four-pixel macro.  Eight-pixel variants fall back to two four-pixel
 * steps.  The .else/.endif lines are elided in this copy.
 */
2782 .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
2783 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
2784 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
2786 bilinear_interpolate_four_pixels src_fmt, dst_fmt
2790 .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
2791 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
2792 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
2796 .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
2797 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
2798 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
2800 bilinear_interpolate_four_pixels src_fmt, dst_fmt
2804 .macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
2805 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
2806 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
2808 bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
2809 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
2813 .macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
2814 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
2815 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
2817 bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
2821 .macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
2822 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
2823 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
2825 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
2826 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
/* Bit flags for generate_bilinear_scanline_func's 'flags' argument */
2830 .set BILINEAR_FLAG_UNROLL_4, 0
2831 .set BILINEAR_FLAG_UNROLL_8, 1
2832 .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
2835 * Main template macro for generating NEON optimized bilinear scanline
2838 * Bilinear scanline scaler macro template uses the following arguments:
2839 * fname - name of the function to generate
2840 * src_fmt - source color format (8888 or 0565)
2841 * dst_fmt - destination color format (8888 or 0565)
2842 * bpp_shift - (1 << bpp_shift) is the size of source pixel in bytes
2843 * prefetch_distance - prefetch in the source image by that many pixels ahead
/*
 * Function-template macro for a complete bilinear scanline scaler:
 * prologue, destination-alignment peeling (1 pixel, then 2), the main
 * unrolled loop (8- or 4-pixel depending on BILINEAR_FLAG_UNROLL_8),
 * trailing-pixel handling, and epilogue.
 * NOTE(review): this copy is heavily elided — register aliases, branch
 * labels, conditional branches and the .if/.else/.endif bodies are
 * missing (e.g. gaps 2851->2868, 2873->2877, 2877->2887).  Treat the
 * visible lines as a skeleton only; do not edit without upstream.
 */
2847 .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
2848 src_bpp_shift, dst_bpp_shift, \
2849 prefetch_distance, flags
2851 pixman_asm_function fname
2868 push {r4, r5, r6, r7, r8, r9}
2869 mov PF_OFFS, #prefetch_distance
2870 ldmia ip, {WB, X, UX, WIDTH}
2871 mul PF_OFFS, PF_OFFS, UX
2873 .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
2877 sub STRIDE, BOTTOM, TOP
2887 vadd.u16 d25, d25, d26
2889 /* ensure good destination alignment */
2892 tst OUT, #(1 << dst_bpp_shift)
2894 vshr.u16 q15, q12, #8
2895 vadd.u16 q12, q12, q13
2896 bilinear_interpolate_last_pixel src_fmt, dst_fmt
2897 sub WIDTH, WIDTH, #1
2899 vadd.u16 q13, q13, q13
2900 vshr.u16 q15, q12, #8
2901 vadd.u16 q12, q12, q13
2905 tst OUT, #(1 << (dst_bpp_shift + 1))
2907 bilinear_interpolate_two_pixels src_fmt, dst_fmt
2908 sub WIDTH, WIDTH, #2
2910 .if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
2911 /*********** 8 pixels per iteration *****************/
2914 tst OUT, #(1 << (dst_bpp_shift + 2))
2916 bilinear_interpolate_four_pixels src_fmt, dst_fmt
2917 sub WIDTH, WIDTH, #4
2919 subs WIDTH, WIDTH, #8
2921 mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
2922 bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
2923 subs WIDTH, WIDTH, #8
2926 bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
2927 subs WIDTH, WIDTH, #8
2930 bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
2934 bilinear_interpolate_four_pixels src_fmt, dst_fmt
2937 /*********** 4 pixels per iteration *****************/
2938 subs WIDTH, WIDTH, #4
2940 mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
2941 bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
2942 subs WIDTH, WIDTH, #4
2945 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
2946 subs WIDTH, WIDTH, #4
2949 bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
2951 /****************************************************/
2953 /* handle the remaining trailing pixels */
2956 bilinear_interpolate_two_pixels src_fmt, dst_fmt
2960 bilinear_interpolate_last_pixel src_fmt, dst_fmt
2962 .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
2965 pop {r4, r5, r6, r7, r8, r9}
2985 /*****************************************************************************/
/* Opt-in marker: dispatch macros will pick this pipelined 8888->8888
 * implementation instead of the generic four-pixel path. */
2987 .set have_bilinear_interpolate_four_pixels_8888_8888, 1
/*
 * Pipelined head: fetch four source pixel-pairs (TMP1..TMP4 columns),
 * run their vertical interpolations (q8..q11) and start the first two
 * horizontal interpolations (q0, q1).  The tail below finishes the
 * remaining horizontal work and stores.
 */
2989 .macro bilinear_interpolate_four_pixels_8888_8888_head
2990 mov TMP1, X, asr #16
2992 add TMP1, TOP, TMP1, asl #2
2993 mov TMP2, X, asr #16
2995 add TMP2, TOP, TMP2, asl #2
2997 vld1.32 {d22}, [TMP1], STRIDE
2998 vld1.32 {d23}, [TMP1]
2999 mov TMP3, X, asr #16
3001 add TMP3, TOP, TMP3, asl #2
3002 vmull.u8 q8, d22, d28
3003 vmlal.u8 q8, d23, d29
3005 vld1.32 {d22}, [TMP2], STRIDE
3006 vld1.32 {d23}, [TMP2]
3007 mov TMP4, X, asr #16
3009 add TMP4, TOP, TMP4, asl #2
3010 vmull.u8 q9, d22, d28
3011 vmlal.u8 q9, d23, d29
3013 vld1.32 {d22}, [TMP3], STRIDE
3014 vld1.32 {d23}, [TMP3]
3015 vmull.u8 q10, d22, d28
3016 vmlal.u8 q10, d23, d29
3018 vshll.u16 q0, d16, #8
3019 vmlsl.u16 q0, d16, d30
3020 vmlal.u16 q0, d17, d30
3023 vld1.32 {d16}, [TMP4], STRIDE
3024 vld1.32 {d17}, [TMP4]
3026 vmull.u8 q11, d16, d28
3027 vmlal.u8 q11, d17, d29
3029 vshll.u16 q1, d18, #8
3030 vmlsl.u16 q1, d18, d31
/*
 * Pipelined tail: finish horizontal interpolation for all four pixels
 * (q1..q3), advance the x-fraction accumulator q12 by q13 (weights
 * refreshed into q15), narrow to 8 bit and store the packed result.
 */
3033 .macro bilinear_interpolate_four_pixels_8888_8888_tail
3034 vmlal.u16 q1, d19, d31
3035 vshr.u16 q15, q12, #8
3036 vshll.u16 q2, d20, #8
3037 vmlsl.u16 q2, d20, d30
3038 vmlal.u16 q2, d21, d30
3039 vshll.u16 q3, d22, #8
3040 vmlsl.u16 q3, d22, d31
3041 vmlal.u16 q3, d23, d31
3042 vadd.u16 q12, q12, q13
3043 vshrn.u32 d0, q0, #16
3044 vshrn.u32 d1, q1, #16
3045 vshrn.u32 d4, q2, #16
3046 vshr.u16 q15, q12, #8
3047 vshrn.u32 d5, q3, #16
3050 vadd.u16 q12, q12, q13
3051 vst1.32 {d6, d7}, [OUT, :128]!
/*
 * Software-pipelined steady state: the tail of the previous four-pixel
 * group (horizontal interpolation, narrow, store of d6/d7) is
 * interleaved instruction-by-instruction with the head of the next
 * group (address generation, loads, vertical interpolation).  Do not
 * reorder lines here without re-checking the pipeline.
 * NOTE(review): a few lines are elided in this copy (gaps e.g.
 * 3092->3094, 3101->3103).
 */
3054 .macro bilinear_interpolate_four_pixels_8888_8888_tail_head
3055 mov TMP1, X, asr #16
3057 add TMP1, TOP, TMP1, asl #2
3058 mov TMP2, X, asr #16
3060 add TMP2, TOP, TMP2, asl #2
3061 vmlal.u16 q1, d19, d31
3062 vshr.u16 q15, q12, #8
3063 vshll.u16 q2, d20, #8
3064 vmlsl.u16 q2, d20, d30
3065 vmlal.u16 q2, d21, d30
3066 vshll.u16 q3, d22, #8
3067 vld1.32 {d20}, [TMP1], STRIDE
3068 vmlsl.u16 q3, d22, d31
3069 vmlal.u16 q3, d23, d31
3070 vld1.32 {d21}, [TMP1]
3071 vmull.u8 q8, d20, d28
3072 vmlal.u8 q8, d21, d29
3073 vshrn.u32 d0, q0, #16
3074 vshrn.u32 d1, q1, #16
3075 vshrn.u32 d4, q2, #16
3076 vld1.32 {d22}, [TMP2], STRIDE
3077 vshrn.u32 d5, q3, #16
3078 vadd.u16 q12, q12, q13
3079 vld1.32 {d23}, [TMP2]
3080 vmull.u8 q9, d22, d28
3081 mov TMP3, X, asr #16
3083 add TMP3, TOP, TMP3, asl #2
3084 mov TMP4, X, asr #16
3086 add TMP4, TOP, TMP4, asl #2
3087 vmlal.u8 q9, d23, d29
3088 vld1.32 {d22}, [TMP3], STRIDE
3089 vshr.u16 q15, q12, #8
3090 vld1.32 {d23}, [TMP3]
3091 vmull.u8 q10, d22, d28
3092 vmlal.u8 q10, d23, d29
3094 vshll.u16 q0, d16, #8
3096 vmlsl.u16 q0, d16, d30
3097 vmlal.u16 q0, d17, d30
3099 vld1.32 {d16}, [TMP4], STRIDE
3100 vadd.u16 q12, q12, q13
3101 vld1.32 {d17}, [TMP4]
3103 vmull.u8 q11, d16, d28
3104 vmlal.u8 q11, d17, d29
3105 vst1.32 {d6, d7}, [OUT, :128]!
3106 vshll.u16 q1, d18, #8
3107 vmlsl.u16 q1, d18, d31
3110 /*****************************************************************************/
/* Opt-in marker: dispatch macros will pick this pipelined 8888->0565
 * eight-pixel implementation instead of the generic fallback. */
3112 .set have_bilinear_interpolate_eight_pixels_8888_0565, 1
/*
 * Pipelined head for eight pixels: the first half mirrors the
 * 8888_8888 four-pixel head (loads + vertical interpolation into
 * q8..q11), the second half (from line 3151) is effectively one
 * four-pixel tail_head iteration that also starts the next group.
 * Same scheduling caveat as above: lines are latency-ordered.
 */
3114 .macro bilinear_interpolate_eight_pixels_8888_0565_head
3115 mov TMP1, X, asr #16
3117 add TMP1, TOP, TMP1, asl #2
3118 mov TMP2, X, asr #16
3120 add TMP2, TOP, TMP2, asl #2
3121 vld1.32 {d20}, [TMP1], STRIDE
3122 vld1.32 {d21}, [TMP1]
3123 vmull.u8 q8, d20, d28
3124 vmlal.u8 q8, d21, d29
3125 vld1.32 {d22}, [TMP2], STRIDE
3126 vld1.32 {d23}, [TMP2]
3127 vmull.u8 q9, d22, d28
3128 mov TMP3, X, asr #16
3130 add TMP3, TOP, TMP3, asl #2
3131 mov TMP4, X, asr #16
3133 add TMP4, TOP, TMP4, asl #2
3134 vmlal.u8 q9, d23, d29
3135 vld1.32 {d22}, [TMP3], STRIDE
3136 vld1.32 {d23}, [TMP3]
3137 vmull.u8 q10, d22, d28
3138 vmlal.u8 q10, d23, d29
3139 vshll.u16 q0, d16, #8
3140 vmlsl.u16 q0, d16, d30
3141 vmlal.u16 q0, d17, d30
3143 vld1.32 {d16}, [TMP4], STRIDE
3144 vld1.32 {d17}, [TMP4]
3146 vmull.u8 q11, d16, d28
3147 vmlal.u8 q11, d17, d29
3148 vshll.u16 q1, d18, #8
3149 vmlsl.u16 q1, d18, d31
/* second four pixels: previous group's horizontal pass interleaved
 * with the next group's loads and vertical pass */
3151 mov TMP1, X, asr #16
3153 add TMP1, TOP, TMP1, asl #2
3154 mov TMP2, X, asr #16
3156 add TMP2, TOP, TMP2, asl #2
3157 vmlal.u16 q1, d19, d31
3158 vshr.u16 q15, q12, #8
3159 vshll.u16 q2, d20, #8
3160 vmlsl.u16 q2, d20, d30
3161 vmlal.u16 q2, d21, d30
3162 vshll.u16 q3, d22, #8
3163 vld1.32 {d20}, [TMP1], STRIDE
3164 vmlsl.u16 q3, d22, d31
3165 vmlal.u16 q3, d23, d31
3166 vld1.32 {d21}, [TMP1]
3167 vmull.u8 q8, d20, d28
3168 vmlal.u8 q8, d21, d29
3169 vshrn.u32 d0, q0, #16
3170 vshrn.u32 d1, q1, #16
3171 vshrn.u32 d4, q2, #16
3172 vld1.32 {d22}, [TMP2], STRIDE
3173 vshrn.u32 d5, q3, #16
3174 vadd.u16 q12, q12, q13
3175 vld1.32 {d23}, [TMP2]
3176 vmull.u8 q9, d22, d28
3177 mov TMP3, X, asr #16
3179 add TMP3, TOP, TMP3, asl #2
3180 mov TMP4, X, asr #16
3182 add TMP4, TOP, TMP4, asl #2
3183 vmlal.u8 q9, d23, d29
3184 vld1.32 {d22}, [TMP3], STRIDE
3185 vshr.u16 q15, q12, #8
3186 vld1.32 {d23}, [TMP3]
3187 vmull.u8 q10, d22, d28
3188 vmlal.u8 q10, d23, d29
3190 vshll.u16 q0, d16, #8
3192 vmlsl.u16 q0, d16, d30
3193 vmlal.u16 q0, d17, d30
3195 vld1.32 {d16}, [TMP4], STRIDE
3196 vadd.u16 q12, q12, q13
3197 vld1.32 {d17}, [TMP4]
3199 vmull.u8 q11, d16, d28
3200 vmlal.u8 q11, d17, d29
3201 vshll.u16 q1, d18, #8
3202 vmlsl.u16 q1, d18, d31
/*
 * Pipelined tail for eight pixels: finish the horizontal pass, narrow
 * to 8888, then pack to r5g6b5 (vshll + vsri pattern) and store all
 * eight 565 pixels in one 128-bit write.
 * NOTE(review): several packing lines are elided in this copy
 * (gaps 3222->3229, 3229->3232).
 */
3205 .macro bilinear_interpolate_eight_pixels_8888_0565_tail
3206 vmlal.u16 q1, d19, d31
3207 vshr.u16 q15, q12, #8
3208 vshll.u16 q2, d20, #8
3209 vmlsl.u16 q2, d20, d30
3210 vmlal.u16 q2, d21, d30
3211 vshll.u16 q3, d22, #8
3212 vmlsl.u16 q3, d22, d31
3213 vmlal.u16 q3, d23, d31
3214 vadd.u16 q12, q12, q13
3215 vshrn.u32 d0, q0, #16
3216 vshrn.u32 d1, q1, #16
3217 vshrn.u32 d4, q2, #16
3218 vshr.u16 q15, q12, #8
3219 vshrn.u32 d5, q3, #16
3222 vadd.u16 q12, q12, q13
3229 vshll.u8 q5, d10, #8
3232 vsri.u16 q5, q7, #11
3233 vst1.32 {d10, d11}, [OUT, :128]!
/*
 * Steady-state ("tail_head") stage of the software-pipelined 8-pixel
 * a8r8g8b8 -> r5g6b5 bilinear scaling loop.  In one pass it:
 *   - completes the horizontal interpolation of the previous batch
 *     (vshll/vmlsl/vmlal.u16 on q0-q3, then vshrn.u32 to narrow),
 *   - loads the top/bottom source pixel pairs for the next batch from
 *     [TMPn]/[TMPn + STRIDE] and does their vertical interpolation
 *     (vmull/vmlal.u8 into q8-q11 with weights d28/d29),
 *   - packs the finished previous batch to r5g6b5 (q5 via vshll/vsri)
 *     and stores it to OUT.
 * Loads and ALU ops are deliberately interleaved for dual-issue
 * scheduling — the instruction order is part of the design.
 *
 * Register roles: X = 16.16 fixed-point source x (asr #16 = integer
 * part, asl #2 converts to a 32bpp byte offset from TOP); d28/d29 =
 * vertical weights; q12/q13/q15(d30,d31) = horizontal weight
 * accumulator / increment / current weights.
 *
 * NOTE(review): the embedded original numbering is sparse (e.g. 3238,
 * 3241, 3245 … and the closing .endm are absent), so interleaved lines
 * such as the X += UX advances are not visible in this excerpt —
 * confirm against the complete file.
 */
3236 .macro bilinear_interpolate_eight_pixels_8888_0565_tail_head
3237 mov TMP1, X, asr #16              /* integer part of x */
3239 add TMP1, TOP, TMP1, asl #2       /* TMP1 = &top_row[x] (32bpp => x*4) */
3240 mov TMP2, X, asr #16
3242 add TMP2, TOP, TMP2, asl #2
3243 vmlal.u16 q1, d19, d31            /* finish horiz interp of previous batch */
3244 vshr.u16 q15, q12, #8             /* d30/d31 = current horizontal weights */
3246 vshll.u16 q2, d20, #8             /* horiz interp: left*256 - left*w + right*w */
3247 vmlsl.u16 q2, d20, d30
3248 vmlal.u16 q2, d21, d30
3249 vshll.u16 q3, d22, #8
3250 vld1.32 {d20}, [TMP1], STRIDE     /* next batch: top pixel pair */
3251 vmlsl.u16 q3, d22, d31
3252 vmlal.u16 q3, d23, d31
3253 vld1.32 {d21}, [TMP1]             /* next batch: bottom pixel pair */
3254 vmull.u8 q8, d20, d28             /* vertical interp: top * top-weight */
3255 vmlal.u8 q8, d21, d29             /*               += bottom * bottom-weight */
3256 vshrn.u32 d0, q0, #16             /* narrow finished pixels to 16 bit */
3257 vshrn.u32 d1, q1, #16
3258 vshrn.u32 d4, q2, #16
3259 vld1.32 {d22}, [TMP2], STRIDE
3260 vshrn.u32 d5, q3, #16
3261 vadd.u16 q12, q12, q13            /* advance horizontal weight accumulator */
3262 vld1.32 {d23}, [TMP2]
3263 vmull.u8 q9, d22, d28             /* vertical interp, second pixel */
3264 mov TMP3, X, asr #16
3266 add TMP3, TOP, TMP3, asl #2
3267 mov TMP4, X, asr #16
3269 add TMP4, TOP, TMP4, asl #2
3270 vmlal.u8 q9, d23, d29
3271 vld1.32 {d22}, [TMP3], STRIDE
3272 vshr.u16 q15, q12, #8             /* weights for the next pixel pair */
3273 vld1.32 {d23}, [TMP3]
3274 vmull.u8 q10, d22, d28            /* vertical interp, third pixel */
3275 vmlal.u8 q10, d23, d29
3277 vshll.u16 q0, d16, #8             /* begin horiz interp of the new batch */
3279 vmlsl.u16 q0, d16, d30
3280 vmlal.u16 q0, d17, d30
3282 vld1.32 {d16}, [TMP4], STRIDE
3283 vadd.u16 q12, q12, q13
3284 vld1.32 {d17}, [TMP4]
3286 vmull.u8 q11, d16, d28            /* vertical interp, fourth pixel */
3287 vmlal.u8 q11, d17, d29
3289 vshll.u16 q1, d18, #8
3290 vmlsl.u16 q1, d18, d31
3292 mov TMP1, X, asr #16              /* second half: pixels 5..8 of the batch */
3294 add TMP1, TOP, TMP1, asl #2
3295 mov TMP2, X, asr #16
3297 add TMP2, TOP, TMP2, asl #2
3298 vmlal.u16 q1, d19, d31
3300 vshr.u16 q15, q12, #8
3301 vshll.u16 q2, d20, #8
3303 vmlsl.u16 q2, d20, d30
3304 vmlal.u16 q2, d21, d30
3305 vshll.u16 q3, d22, #8
3306 vld1.32 {d20}, [TMP1], STRIDE
3307 vmlsl.u16 q3, d22, d31
3308 vmlal.u16 q3, d23, d31
3309 vld1.32 {d21}, [TMP1]
3310 vmull.u8 q8, d20, d28
3311 vmlal.u8 q8, d21, d29
3313 vshll.u8 q5, d10, #8              /* pack previous batch into r5g6b5 ... */
3315 vshrn.u32 d0, q0, #16
3317 vshrn.u32 d1, q1, #16
3318 vsri.u16 q5, q7, #11              /* ... shift-insert colour fields */
3319 vshrn.u32 d4, q2, #16
3320 vld1.32 {d22}, [TMP2], STRIDE
3321 vshrn.u32 d5, q3, #16
3322 vadd.u16 q12, q12, q13
3323 vld1.32 {d23}, [TMP2]
3324 vmull.u8 q9, d22, d28
3325 mov TMP3, X, asr #16
3327 add TMP3, TOP, TMP3, asl #2
3328 mov TMP4, X, asr #16
3330 add TMP4, TOP, TMP4, asl #2
3331 vmlal.u8 q9, d23, d29
3332 vld1.32 {d22}, [TMP3], STRIDE
3333 vshr.u16 q15, q12, #8
3334 vld1.32 {d23}, [TMP3]
3335 vmull.u8 q10, d22, d28
3336 vmlal.u8 q10, d23, d29
3338 vshll.u16 q0, d16, #8
3340 vmlsl.u16 q0, d16, d30
3341 vmlal.u16 q0, d17, d30
3343 vld1.32 {d16}, [TMP4], STRIDE
3344 vadd.u16 q12, q12, q13
3345 vld1.32 {d17}, [TMP4]
3347 vmull.u8 q11, d16, d28
3348 vmlal.u8 q11, d17, d29
3349 vshll.u16 q1, d18, #8
3350 vst1.32 {d10, d11}, [OUT, :128]!  /* store 8 r5g6b5 pixels of previous batch */
3351 vmlsl.u16 q1, d18, d31
3353 /*****************************************************************************/
/*
 * Instantiate the bilinear scanline scaling functions.  Arguments:
 * function name, source format, destination format, source and
 * destination bpp shifts (log2 bytes/pixel: 2 = 32bpp, 1 = 16bpp),
 * prefetch distance, flags (unroll factor etc.).  The macro itself is
 * defined earlier in the file.
 */

/* a8r8g8b8 -> a8r8g8b8, 4-pixel unroll */
3355 generate_bilinear_scanline_func \
3356 pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
3357 2, 2, 28, BILINEAR_FLAG_UNROLL_4

/* a8r8g8b8 -> r5g6b5, 8-pixel unroll using the full NEON register file */
3359 generate_bilinear_scanline_func \
3360 pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
3361 2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS

/* r5g6b5 -> x8r8g8b8, 4-pixel unroll */
3363 generate_bilinear_scanline_func \
3364 pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
3365 1, 2, 28, BILINEAR_FLAG_UNROLL_4

/* r5g6b5 -> r5g6b5, 4-pixel unroll */
3367 generate_bilinear_scanline_func \
3368 pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
3369 1, 1, 28, BILINEAR_FLAG_UNROLL_4