2 * Copyright © 2009 Nokia Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
23 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
27 * This file contains implementations of NEON optimized pixel processing
28 * functions. There is no full and detailed tutorial, but some functions
29 * (those which are exposing some new or interesting features) are
30 * extensively commented and can be used as examples.
32 * You may want to have a look at the comments for following functions:
33 * - pixman_composite_over_8888_0565_asm_neon
34 * - pixman_composite_over_n_8_0565_asm_neon
37 /* Prevent the stack from becoming executable for no reason... */
38 #if defined(__linux__) && defined(__ELF__)
39 .section .note.GNU-stack,"",%progbits
/* NOTE(review): the matching #endif is not visible in this extract — the
   embedded source line numbers jump from 39 to 46, so some lines are elided;
   confirm against the full source file. */
46 .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
47 .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
/* The template macros used throughout this file (generate_composite_function,
   cache_preload, the PF prefetch helper, etc.) come from this header. */
52 #include "pixman-arm-neon-asm.h"
54 /* Global configuration options and preferences */
57 * The code can optionally make use of unaligned memory accesses to improve
58 * performance of handling leading/trailing pixels for each scanline.
59 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
60 * example in linux if unaligned memory accesses are not configured to
61 * generate exceptions.
63 .set RESPECT_STRICT_ALIGNMENT, 1
66 * Set default prefetch type. There is a choice between the following options:
68 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
69 * as NOP to workaround some HW bugs or for whatever other reason)
71 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
72 * advanced prefetch introduces heavy overhead)
74 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
75 * which can run ARM and NEON instructions simultaneously so that extra ARM
76 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
78 * Note: some types of function can't support advanced prefetch and fallback
79 * to simple one (those which handle 24bpp pixels)
81 .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
83 /* Prefetch distance in pixels for simple prefetch */
84 .set PREFETCH_DISTANCE_SIMPLE, 64
87 * Implementation of pixman_composite_over_8888_0565_asm_neon
89 * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
90 * performs OVER compositing operation. Function fast_composite_over_8888_0565
91 * from pixman-fast-path.c does the same in C and can be used as a reference.
93 * First we need to have some NEON assembly code which can do the actual
94 * operation on the pixels and provide it to the template macro.
96 * Template macro quite conveniently takes care of emitting all the necessary
97 * code for memory reading and writing (including quite tricky cases of
98 * handling unaligned leading/trailing pixels), so we only need to deal with
99 * the data in NEON registers.
101 * NEON registers allocation in general is recommended to be the following:
102 * d0, d1, d2, d3 - contain loaded source pixel data
103 * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed)
104 * d24, d25, d26, d27 - contain loading mask pixel data (if mask is used)
105 * d28, d29, d30, d31 - place for storing the result (destination pixels)
107 * As can be seen above, four 64-bit NEON registers are used for keeping
108 * intermediate pixel data and up to 8 pixels can be processed in one step
109 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
111 * This particular function uses the following registers allocation:
112 * d0, d1, d2, d3 - contain loaded source pixel data
113 * d4, d5 - contain loaded destination pixels (they are needed)
114 * d28, d29 - place for storing the result (destination pixels)
118 * Step one. We need to have some code to do some arithmetics on pixel data.
119 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
120 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
121 * perform all the needed calculations and write the result to {d28, d29}.
122 * The rationale for having two macros and not just one will be explained
123 * later. In practice, any single monolithic function which does the work can
124 * be split into two parts in any arbitrary way without affecting correctness.
126 * There is one special trick here too. Common template macro can optionally
127 * make our life a bit easier by doing R, G, B, A color components
128 * deinterleaving for 32bpp pixel formats (and this feature is used in
129 * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
130 * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
131 * actually use d0 register for blue channel (a vector of eight 8-bit
132 * values), d1 register for green, d2 for red and d3 for alpha. This
133 * simple conversion can be also done with a few NEON instructions:
135 * Packed to planar conversion:
141 * Planar to packed conversion:
147 * But pixel can be loaded directly in planar format using VLD4.8 NEON
148 * instruction. It is 1 cycle slower than VLD1.32, so this is not always
149 * desirable, that's why deinterleaving is optional.
151 * But anyway, here is the code:
/*
 * Head step of OVER(a8r8g8b8 src -> r5g6b5 dst): expands dst to planar
 * 8-bit, inverts source alpha and starts the (1-alpha)*dst multiplies.
 * NOTE(review): the embedded source line numbers are non-contiguous, so
 * several instructions (and the closing .endm) are elided in this extract.
 */
153 .macro pixman_composite_over_8888_0565_process_pixblock_head
154 /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
155 and put data into d6 - red, d7 - green, d30 - blue */
160 vmvn.8 d3, d3 /* invert source alpha */
162 vshrn.u16 d30, q2, #2
163 /* now do alpha blending, storing results in 8-bit planar format
164 into d16 - red, d19 - green, d18 - blue */
167 vmull.u8 q12, d3, d30
168 vrshr.u16 q13, q10, #8 /* rounding correction terms for the div-by-255 */
169 vrshr.u16 q3, q11, #8
170 vrshr.u16 q15, q12, #8
171 vraddhn.u16 d20, q10, q13 /* (x + ((x + 128) >> 8)) >> 8 approximation */
172 vraddhn.u16 d23, q11, q3
173 vraddhn.u16 d22, q12, q15
/*
 * Tail step: adds the source channels to the scaled destination with
 * saturation and repacks the result to r5g6b5 in {d28, d29}.
 * NOTE(review): some instructions are elided here as well.
 */
176 .macro pixman_composite_over_8888_0565_process_pixblock_tail
177 /* ... continue alpha blending */
178 vqadd.u8 d16, d2, d20 /* saturating add of src red channel */
180 /* convert the result to r5g6b5 and store it into {d28, d29} */
181 vshll.u8 q14, d16, #8
185 vsri.u16 q14, q9, #11 /* insert green/blue bits below the red field */
189 * OK, now we got almost everything that we need. Using the above two
190 * macros, the work can be done right. But now we want to optimize
191 * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
192 * a lot from good code scheduling and software pipelining.
194 * Let's construct some code, which will run in the core main loop.
195 * Some pseudo-code of the main loop will look like this:
203 * It may look a bit weird, but this setup allows hiding instruction
204 * latencies better and also utilize dual-issue capability more
205 * efficiently (make pairs of load-store and ALU instructions).
207 * So what we need now is a '*_tail_head' macro, which will be used
208 * in the core main loop. A trivial straightforward implementation
209 * of this macro would look like this:
211 * pixman_composite_over_8888_0565_process_pixblock_tail
212 * vst1.16 {d28, d29}, [DST_W, :128]!
213 * vld1.16 {d4, d5}, [DST_R, :128]!
214 * vld4.32 {d0, d1, d2, d3}, [SRC]!
215 * pixman_composite_over_8888_0565_process_pixblock_head
218 * Now it also got some VLD/VST instructions. We simply can't move from
219 * processing one block of pixels to the other one with just arithmetics.
220 * The previously processed data needs to be written to memory and new
221 * data needs to be fetched. Fortunately, this main loop does not deal
222 * with partial leading/trailing pixels and can load/store a full block
223 * of pixels in a bulk. Additionally, destination buffer is already
224 * 16 bytes aligned here (which is good for performance).
226 * New things here are DST_R, DST_W, SRC and MASK identifiers. These
227 * are the aliases for ARM registers which are used as pointers for
228 * accessing data. We maintain separate pointers for reading and writing
229 * destination buffer (DST_R and DST_W).
231 * Another new thing is 'cache_preload' macro. It is used for prefetching
232 * data into CPU L2 cache and improve performance when dealing with large
233 * images which are far larger than cache size. It uses one argument
234 * (actually two, but they need to be the same here) - number of pixels
235 * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
236 * details about this macro. Moreover, if good performance is needed
237 * the code from this macro needs to be copied into '*_tail_head' macro
238 * and mixed with the rest of code for optimal instructions scheduling.
239 * We are actually doing it below.
241 * Now after all the explanations, here is the optimized code.
242 * Different instruction streams (originating from '*_head', '*_tail'
243 * and 'cache_preload' macro) use different indentation levels for
244 * better readability. Actually taking the code from one of these
245 * indentation levels and ignoring a few VLD/VST instructions would
246 * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
/*
 * Software-pipelined main-loop body: interleaves the tail of the previous
 * pixel block with the head of the next one, plus advanced prefetch (PF)
 * instructions, for dual-issue on Cortex-A8.  See the long explanation
 * above for the indentation convention.
 * NOTE(review): non-contiguous embedded line numbers indicate some
 * instructions (and the closing .endm) are elided in this extract.
 */
252 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head
253 vqadd.u8 d16, d2, d20
254 vld1.16 {d4, d5}, [DST_R, :128]! /* load next block of dst pixels */
260 vshll.u8 q14, d16, #8
261 PF add PF_X, PF_X, #8 /* advance prefetch position by one block */
265 PF addne PF_X, PF_X, #8
267 PF subne PF_CTL, PF_CTL, #1
269 vshrn.u16 d30, q2, #2
271 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
273 vmull.u8 q12, d3, d30
274 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
278 vrshr.u16 q13, q10, #8
279 PF subge PF_X, PF_X, ORIG_W /* wrap prefetch to the next scanline */
280 vrshr.u16 q3, q11, #8
281 vrshr.u16 q15, q12, #8
282 PF subges PF_CTL, PF_CTL, #0x10
283 vsri.u16 q14, q9, #11
284 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! /* touch next src line */
285 vraddhn.u16 d20, q10, q13
286 vraddhn.u16 d23, q11, q3
287 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! /* touch next dst line */
288 vraddhn.u16 d22, q12, q15
289 vst1.16 {d28, d29}, [DST_W, :128]! /* store previous block's result */
294 /* If we did not care much about the performance, we would just use this... */
/*
 * NOTE(review): this trivial reference variant defines the same macro name
 * as the optimized version above; in the full source it is presumably
 * disabled (e.g. guarded out) since gas would reject a redefinition —
 * confirm against the complete file.  Lines appear elided here too.
 */
295 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head
296 pixman_composite_over_8888_0565_process_pixblock_tail
297 vst1.16 {d28, d29}, [DST_W, :128]!
298 vld1.16 {d4, d5}, [DST_R, :128]!
300 pixman_composite_over_8888_0565_process_pixblock_head
307 * And now the final part. We are using 'generate_composite_function' macro
308 * to put all the stuff together. We are specifying the name of the function
309 * which we want to get, number of bits per pixel for the source, mask and
310 * destination (0 if unused, like mask in this case). Next come some bit
312 * FLAG_DST_READWRITE - tells that the destination buffer is both read
313 * and written, for write-only buffer we would use
314 * FLAG_DST_WRITEONLY flag instead
315 * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
316 * and separate color channels for 32bpp format.
317 * The next things are:
318 * - the number of pixels processed per iteration (8 in this case, because
319 * that's the maximum what can fit into four 64-bit NEON registers).
320 * - prefetch distance, measured in pixel blocks. In this case it is 5 times
321 * by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
322 * prefetch distance can be selected by running some benchmarks.
324 * After that we specify some macros, these are 'default_init',
325 * 'default_cleanup' here which are empty (but it is possible to have custom
326 * init/cleanup macros to be able to save/restore some extra NEON registers
327 * like d8-d15 or do anything else) followed by
328 * 'pixman_composite_over_8888_0565_process_pixblock_head',
329 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
330 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
331 * which we got implemented above.
333 * The last part is the NEON registers allocation scheme.
/*
 * Instantiate pixman_composite_over_8888_0565_asm_neon from the pixblock
 * macros above: 32bpp src, no mask (0), 16bpp dst, deinterleaved planar
 * handling, 8 pixels per block, prefetch distance of 5 blocks.
 * NOTE(review): the default_init/default_cleanup argument lines appear
 * elided in this extract (line numbers jump from 339 to 342).
 */
335 generate_composite_function \
336     pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
337     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
338     8, /* number of pixels, processed in a single block */ \
339     5, /* prefetch distance */ \
342     pixman_composite_over_8888_0565_process_pixblock_head, \
343     pixman_composite_over_8888_0565_process_pixblock_tail, \
344     pixman_composite_over_8888_0565_process_pixblock_tail_head, \
345     28, /* dst_w_basereg */ \
346     4,  /* dst_r_basereg */ \
347     0,  /* src_basereg   */ \
348     24  /* mask_basereg  */
350 /******************************************************************************/
/*
 * OVER(solid color -> r5g6b5 dst).  Same arithmetic as the 8888_0565 head
 * above, but the source color is constant: it is loaded once by the init
 * macro below, so alpha inversion happens there rather than per block.
 * NOTE(review): non-contiguous embedded line numbers — several
 * instructions and the .endm directives are elided in this extract.
 */
352 .macro pixman_composite_over_n_0565_process_pixblock_head
353 /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
354 and put data into d6 - red, d7 - green, d30 - blue */
360 vshrn.u16 d30, q2, #2
361 /* now do alpha blending, storing results in 8-bit planar format
362 into d16 - red, d19 - green, d18 - blue */
365 vmull.u8 q12, d3, d30
366 vrshr.u16 q13, q10, #8
367 vrshr.u16 q3, q11, #8
368 vrshr.u16 q15, q12, #8
369 vraddhn.u16 d20, q10, q13
370 vraddhn.u16 d23, q11, q3
371 vraddhn.u16 d22, q12, q15
/* Tail step: saturating add of source and repack to r5g6b5. */
374 .macro pixman_composite_over_n_0565_process_pixblock_tail
375 /* ... continue alpha blending */
376 vqadd.u8 d16, d2, d20
378 /* convert the result to r5g6b5 and store it into {d28, d29} */
379 vshll.u8 q14, d16, #8
383 vsri.u16 q14, q9, #11
/* Unoptimized tail_head: plain tail + load/store + head, no interleaving. */
386 /* TODO: expand macros and do better instructions scheduling */
387 .macro pixman_composite_over_n_0565_process_pixblock_tail_head
388     pixman_composite_over_n_0565_process_pixblock_tail
389     vld1.16 {d4, d5}, [DST_R, :128]!
390     vst1.16 {d28, d29}, [DST_W, :128]!
391     pixman_composite_over_n_0565_process_pixblock_head
/*
 * Init: fetch the solid source color from the stack (ARGS_STACK_OFFSET),
 * then invert its alpha once up front.
 */
395 .macro pixman_composite_over_n_0565_init
396     add DUMMY, sp, #ARGS_STACK_OFFSET
397     vld1.32 {d3[0]}, [DUMMY]
402     vmvn.8 d3, d3 /* invert source alpha */
/*
 * Instantiate pixman_composite_over_n_0565_asm_neon: solid source (0 bpp),
 * no mask, 16bpp dst, with a custom init macro that loads the solid color.
 * NOTE(review): the cleanup argument line appears elided in this extract.
 */
405 generate_composite_function \
406     pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
407     FLAG_DST_READWRITE, \
408     8, /* number of pixels, processed in a single block */ \
409     5, /* prefetch distance */ \
410     pixman_composite_over_n_0565_init, \
412     pixman_composite_over_n_0565_process_pixblock_head, \
413     pixman_composite_over_n_0565_process_pixblock_tail, \
414     pixman_composite_over_n_0565_process_pixblock_tail_head, \
415     28, /* dst_w_basereg */ \
416     4,  /* dst_r_basereg */ \
417     0,  /* src_basereg   */ \
418     24  /* mask_basereg  */
420 /******************************************************************************/
/*
 * SRC(a8r8g8b8 -> r5g6b5): pure format conversion, no blending.
 * NOTE(review): the head macro body and much of the tail are elided in
 * this extract (non-contiguous embedded line numbers, missing .endm).
 */
422 .macro pixman_composite_src_8888_0565_process_pixblock_head
428 .macro pixman_composite_src_8888_0565_process_pixblock_tail
430 vsri.u16 q14, q9, #11 /* merge green/blue into the 16bpp result */
/* Pipelined loop body with simple-style PF prefetch of the source. */
433 .macro pixman_composite_src_8888_0565_process_pixblock_tail_head
435 PF add PF_X, PF_X, #8
438 PF addne PF_X, PF_X, #8
439 PF subne PF_CTL, PF_CTL, #1
440 vsri.u16 q14, q9, #11
442 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
444 vst1.16 {d28, d29}, [DST_W, :128]!
445 PF subge PF_X, PF_X, ORIG_W
446 PF subges PF_CTL, PF_CTL, #0x10
448 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
/*
 * Instantiate pixman_composite_src_8888_0565_asm_neon: 32bpp src, no mask,
 * 16bpp write-only dst; larger prefetch distance (10 blocks) since there
 * is no destination read.  NOTE(review): init/cleanup argument lines
 * appear elided in this extract.
 */
452 generate_composite_function \
453     pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
454     FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
455     8, /* number of pixels, processed in a single block */ \
456     10, /* prefetch distance */ \
459     pixman_composite_src_8888_0565_process_pixblock_head, \
460     pixman_composite_src_8888_0565_process_pixblock_tail, \
461     pixman_composite_src_8888_0565_process_pixblock_tail_head
463 /******************************************************************************/
/*
 * SRC(r5g6b5 -> a8r8g8b8): widen 16bpp pixels in q0 into planar 8-bit
 * channels d28..d30 (narrowing shifts pick out the r/g/b fields).
 * NOTE(review): some instructions and .endm lines are elided here.
 */
465 .macro pixman_composite_src_0565_8888_process_pixblock_head
466 vshrn.u16 d30, q0, #8 /* extract the top (red) field */
467 vshrn.u16 d29, q0, #3 /* extract the green field */
472 vshrn.u16 d28, q0, #2 /* extract the blue field */
475 .macro pixman_composite_src_0565_8888_process_pixblock_tail
/* Unoptimized loop body: tail + interleaved store + head. */
478 /* TODO: expand macros and do better instructions scheduling */
479 .macro pixman_composite_src_0565_8888_process_pixblock_tail_head
480     pixman_composite_src_0565_8888_process_pixblock_tail
481     vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
483     pixman_composite_src_0565_8888_process_pixblock_head
/*
 * Instantiate pixman_composite_src_0565_8888_asm_neon: 16bpp src, no mask,
 * 32bpp write-only deinterleaved dst.  NOTE(review): init/cleanup argument
 * lines appear elided in this extract.
 */
487 generate_composite_function \
488     pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
489     FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
490     8, /* number of pixels, processed in a single block */ \
491     10, /* prefetch distance */ \
494     pixman_composite_src_0565_8888_process_pixblock_head, \
495     pixman_composite_src_0565_8888_process_pixblock_tail, \
496     pixman_composite_src_0565_8888_process_pixblock_tail_head
498 /******************************************************************************/
/*
 * ADD(a8 -> a8): saturating add of 32 8-bit pixels per block.
 * NOTE(review): the head and tail macro bodies (the actual vqadd work)
 * are elided in this extract — only the macro openers remain visible.
 */
500 .macro pixman_composite_add_8_8_process_pixblock_head
505 .macro pixman_composite_add_8_8_process_pixblock_tail
/* Pipelined loop body; PF_X advances by 32 (block size in pixels). */
508 .macro pixman_composite_add_8_8_process_pixblock_tail_head
510 PF add PF_X, PF_X, #32
512 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
513 PF addne PF_X, PF_X, #32
514 PF subne PF_CTL, PF_CTL, #1
515 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
517 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
518 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
519 PF subge PF_X, PF_X, ORIG_W
520 PF subges PF_CTL, PF_CTL, #0x10
522 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
523 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
/*
 * Instantiate pixman_composite_add_8_8_asm_neon: 8bpp src and dst,
 * 32 pixels per block.  NOTE(review): init/cleanup argument lines
 * appear elided in this extract.
 */
527 generate_composite_function \
528     pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
529     FLAG_DST_READWRITE, \
530     32, /* number of pixels, processed in a single block */ \
531     10, /* prefetch distance */ \
534     pixman_composite_add_8_8_process_pixblock_head, \
535     pixman_composite_add_8_8_process_pixblock_tail, \
536     pixman_composite_add_8_8_process_pixblock_tail_head
538 /******************************************************************************/
/*
 * ADD(a8r8g8b8 -> a8r8g8b8) loop body: reuses the add_8_8 head/tail
 * (see generate invocation below) but loads/stores 32-bit lanes and
 * advances the prefetch by 8 pixels per block.
 * NOTE(review): some lines and the closing .endm are elided here.
 */
540 .macro pixman_composite_add_8888_8888_process_pixblock_tail_head
542 PF add PF_X, PF_X, #8
544 vld1.32 {d4, d5, d6, d7}, [DST_R, :128]!
545 PF addne PF_X, PF_X, #8
546 PF subne PF_CTL, PF_CTL, #1
547 vst1.32 {d28, d29, d30, d31}, [DST_W, :128]!
549 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
550 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
551 PF subge PF_X, PF_X, ORIG_W
552 PF subges PF_CTL, PF_CTL, #0x10
554 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
555 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
/*
 * Instantiate the full-image and single-scanline ADD 8888 functions.
 * Both share the add_8_8 head/tail (saturating add is format-agnostic
 * at the byte level) with the 8888-specific tail_head above.
 * NOTE(review): init/cleanup argument lines appear elided in this extract.
 */
559 generate_composite_function \
560     pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
561     FLAG_DST_READWRITE, \
562     8, /* number of pixels, processed in a single block */ \
563     10, /* prefetch distance */ \
566     pixman_composite_add_8_8_process_pixblock_head, \
567     pixman_composite_add_8_8_process_pixblock_tail, \
568     pixman_composite_add_8888_8888_process_pixblock_tail_head
/* Scanline variant: no prefetch distance parameter is visible here. */
570 generate_composite_function_single_scanline \
571     pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
572     FLAG_DST_READWRITE, \
573     8, /* number of pixels, processed in a single block */ \
576     pixman_composite_add_8_8_process_pixblock_head, \
577     pixman_composite_add_8_8_process_pixblock_tail, \
578     pixman_composite_add_8888_8888_process_pixblock_tail_head
580 /******************************************************************************/
/*
 * OUT_REVERSE(a8r8g8b8, a8r8g8b8): dst = dst * (1 - src.alpha).
 * Head computes (255 - alpha) and starts the per-channel multiplies;
 * tail applies the rounding div-by-255 approximation
 * ((x + ((x + 128) >> 8)) >> 8) via vrshr + vraddhn.
 * NOTE(review): embedded line numbers are non-contiguous — some
 * instructions and the .endm directives are elided in this extract.
 */
582 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
583 vmvn.8 d24, d3 /* get inverted alpha */
584 /* do alpha blending */
587 vmull.u8 q10, d24, d6
588 vmull.u8 q11, d24, d7
591 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
592 vrshr.u16 q14, q8, #8
593 vrshr.u16 q15, q9, #8
594 vrshr.u16 q12, q10, #8
595 vrshr.u16 q13, q11, #8
596 vraddhn.u16 d28, q14, q8
597 vraddhn.u16 d29, q15, q9
598 vraddhn.u16 d30, q12, q10
599 vraddhn.u16 d31, q13, q11
/* Pipelined loop body with advanced PF prefetch interleaved. */
602 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
603 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
604 vrshr.u16 q14, q8, #8
605 PF add PF_X, PF_X, #8
607 vrshr.u16 q15, q9, #8
608 vrshr.u16 q12, q10, #8
609 vrshr.u16 q13, q11, #8
610 PF addne PF_X, PF_X, #8
611 PF subne PF_CTL, PF_CTL, #1
612 vraddhn.u16 d28, q14, q8
613 vraddhn.u16 d29, q15, q9
615 vraddhn.u16 d30, q12, q10
616 vraddhn.u16 d31, q13, q11
618 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
620 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
621 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
622 PF subge PF_X, PF_X, ORIG_W
624 PF subges PF_CTL, PF_CTL, #0x10
626 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
627 vmull.u8 q10, d22, d6
628 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
629 vmull.u8 q11, d22, d7
/*
 * Instantiate the single-scanline OUT_REVERSE function from the macros
 * above.  NOTE(review): init/cleanup argument lines appear elided.
 */
632 generate_composite_function_single_scanline \
633     pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
634     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
635     8, /* number of pixels, processed in a single block */ \
638     pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
639     pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
640     pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
642 /******************************************************************************/
/*
 * OVER(a8r8g8b8 -> a8r8g8b8) = OUT_REVERSE + saturating add of the source:
 * dst = src + dst * (1 - src.alpha).  Head and tail simply reuse the
 * out_reverse macros and append the vqadd step.
 * NOTE(review): embedded line numbers are non-contiguous — some
 * instructions and the .endm directives are elided in this extract.
 */
644 .macro pixman_composite_over_8888_8888_process_pixblock_head
645     pixman_composite_out_reverse_8888_8888_process_pixblock_head
648 .macro pixman_composite_over_8888_8888_process_pixblock_tail
649     pixman_composite_out_reverse_8888_8888_process_pixblock_tail
650 vqadd.u8 q14, q0, q14 /* add source channels with saturation */
651 vqadd.u8 q15, q1, q15
/* Pipelined loop body: tail of previous block + head of next + prefetch. */
654 .macro pixman_composite_over_8888_8888_process_pixblock_tail_head
655 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
656 vrshr.u16 q14, q8, #8
657 PF add PF_X, PF_X, #8
659 vrshr.u16 q15, q9, #8
660 vrshr.u16 q12, q10, #8
661 vrshr.u16 q13, q11, #8
662 PF addne PF_X, PF_X, #8
663 PF subne PF_CTL, PF_CTL, #1
664 vraddhn.u16 d28, q14, q8
665 vraddhn.u16 d29, q15, q9
667 vraddhn.u16 d30, q12, q10
668 vraddhn.u16 d31, q13, q11
669 vqadd.u8 q14, q0, q14
670 vqadd.u8 q15, q1, q15
672 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
674 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
675 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
676 PF subge PF_X, PF_X, ORIG_W
678 PF subges PF_CTL, PF_CTL, #0x10
680 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
681 vmull.u8 q10, d22, d6
682 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
683 vmull.u8 q11, d22, d7
/*
 * Instantiate the full-image and single-scanline OVER 8888 functions.
 * NOTE(review): init/cleanup argument lines appear elided in this extract.
 */
686 generate_composite_function \
687     pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
688     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
689     8, /* number of pixels, processed in a single block */ \
690     5, /* prefetch distance */ \
693     pixman_composite_over_8888_8888_process_pixblock_head, \
694     pixman_composite_over_8888_8888_process_pixblock_tail, \
695     pixman_composite_over_8888_8888_process_pixblock_tail_head
/* Scanline variant of the same operation. */
697 generate_composite_function_single_scanline \
698     pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
699     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
700     8, /* number of pixels, processed in a single block */ \
703     pixman_composite_over_8888_8888_process_pixblock_head, \
704     pixman_composite_over_8888_8888_process_pixblock_tail, \
705     pixman_composite_over_8888_8888_process_pixblock_tail_head
707 /******************************************************************************/
/*
 * OVER(solid color -> a8r8g8b8): like over_8888_8888 but the source is a
 * constant color kept in {d0-d3} with its inverted alpha precomputed in
 * d24 by the init macro below.
 * NOTE(review): embedded line numbers are non-contiguous — some
 * instructions and the .endm directives are elided in this extract.
 */
709 .macro pixman_composite_over_n_8888_process_pixblock_head
710 /* deinterleaved source pixels in {d0, d1, d2, d3} */
711 /* inverted alpha in {d24} */
712 /* destination pixels in {d4, d5, d6, d7} */
715 vmull.u8 q10, d24, d6
716 vmull.u8 q11, d24, d7
/* Tail: rounding div-by-255 per channel, then saturating add of source. */
719 .macro pixman_composite_over_n_8888_process_pixblock_tail
720 vrshr.u16 q14, q8, #8
721 vrshr.u16 q15, q9, #8
722 vrshr.u16 q2, q10, #8
723 vrshr.u16 q3, q11, #8
724 vraddhn.u16 d28, q14, q8
725 vraddhn.u16 d29, q15, q9
726 vraddhn.u16 d30, q2, q10
727 vraddhn.u16 d31, q3, q11
728 vqadd.u8 q14, q0, q14
729 vqadd.u8 q15, q1, q15
/* Pipelined loop body; only dst is prefetched (source is a constant). */
732 .macro pixman_composite_over_n_8888_process_pixblock_tail_head
733 vrshr.u16 q14, q8, #8
734 vrshr.u16 q15, q9, #8
735 vrshr.u16 q2, q10, #8
736 vrshr.u16 q3, q11, #8
737 vraddhn.u16 d28, q14, q8
738 vraddhn.u16 d29, q15, q9
739 vraddhn.u16 d30, q2, q10
740 vraddhn.u16 d31, q3, q11
741 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
742 vqadd.u8 q14, q0, q14
743 PF add PF_X, PF_X, #8
745 PF addne PF_X, PF_X, #8
746 PF subne PF_CTL, PF_CTL, #1
747 vqadd.u8 q15, q1, q15
750 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
752 PF subge PF_X, PF_X, ORIG_W
753 vmull.u8 q10, d24, d6
754 PF subges PF_CTL, PF_CTL, #0x10
755 vmull.u8 q11, d24, d7
756 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
757 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/*
 * Init: load the solid source color from the stack and precompute the
 * inverted alpha in d24 (channel replication lines appear elided here).
 */
760 .macro pixman_composite_over_n_8888_init
761     add DUMMY, sp, #ARGS_STACK_OFFSET
762     vld1.32 {d3[0]}, [DUMMY]
767     vmvn.8 d24, d3 /* get inverted alpha */
/*
 * Instantiate pixman_composite_over_n_8888_asm_neon.  Head/tail are
 * shared with over_8888_8888; only init and tail_head are specialized.
 * NOTE(review): the cleanup argument line appears elided in this extract.
 */
770 generate_composite_function \
771     pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
772     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
773     8, /* number of pixels, processed in a single block */ \
774     5, /* prefetch distance */ \
775     pixman_composite_over_n_8888_init, \
777     pixman_composite_over_8888_8888_process_pixblock_head, \
778     pixman_composite_over_8888_8888_process_pixblock_tail, \
779     pixman_composite_over_n_8888_process_pixblock_tail_head
781 /******************************************************************************/
/*
 * OVER_REVERSE(solid color, a8r8g8b8 dst): roles of src and dst are
 * swapped relative to over_n_8888 — note the swapped base registers in
 * the generate invocation below (dst read in d0-d3, src in d4-d7).
 * NOTE(review): embedded line numbers are non-contiguous — some
 * instructions and the .endm directives are elided in this extract.
 */
783 .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
784 vrshr.u16 q14, q8, #8
785 PF add PF_X, PF_X, #8
787 vrshr.u16 q15, q9, #8
788 vrshr.u16 q12, q10, #8
789 vrshr.u16 q13, q11, #8
790 PF addne PF_X, PF_X, #8
791 PF subne PF_CTL, PF_CTL, #1
792 vraddhn.u16 d28, q14, q8
793 vraddhn.u16 d29, q15, q9
795 vraddhn.u16 d30, q12, q10
796 vraddhn.u16 d31, q13, q11
797 vqadd.u8 q14, q0, q14
798 vqadd.u8 q15, q1, q15
799 vld4.8 {d0, d1, d2, d3}, [DST_R, :128]!
801 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
802 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
803 PF subge PF_X, PF_X, ORIG_W
805 PF subges PF_CTL, PF_CTL, #0x10
807 vmull.u8 q10, d22, d6
808 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
809 vmull.u8 q11, d22, d7
/*
 * Init: load the solid color into d7's lane 0 (further setup lines
 * appear elided in this extract).
 */
812 .macro pixman_composite_over_reverse_n_8888_init
813     add DUMMY, sp, #ARGS_STACK_OFFSET
814     vld1.32 {d7[0]}, [DUMMY]
/*
 * Instantiate pixman_composite_over_reverse_n_8888_asm_neon.  The swapped
 * dst_r/src base registers (0 and 4) implement the reversed operand roles.
 * NOTE(review): the cleanup argument line appears elided in this extract.
 */
821 generate_composite_function \
822     pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
823     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
824     8, /* number of pixels, processed in a single block */ \
825     5, /* prefetch distance */ \
826     pixman_composite_over_reverse_n_8888_init, \
828     pixman_composite_over_8888_8888_process_pixblock_head, \
829     pixman_composite_over_8888_8888_process_pixblock_tail, \
830     pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
831     28, /* dst_w_basereg */ \
832     0,  /* dst_r_basereg */ \
833     4,  /* src_basereg   */ \
834     24  /* mask_basereg  */
836 /******************************************************************************/
/*
 * OVER(a8r8g8b8 src IN a8 mask -> r5g6b5 dst): first the source is
 * multiplied by the mask (d24) with rounding div-by-255 (the IN step),
 * the r5g6b5 destination is expanded to planar 8-bit, then the usual
 * OVER blend and repack to 16bpp.  Source channels live in d8-d11
 * (callee-saved regs — see the need_all_regs init/cleanup below).
 * NOTE(review): embedded line numbers are non-contiguous — some
 * instructions and the .endm directives are elided in this extract.
 */
838 .macro pixman_composite_over_8888_8_0565_process_pixblock_head
839 vmull.u8 q0, d24, d8 /* IN for SRC pixels (part1) */
841 vmull.u8 q6, d24, d10
842 vmull.u8 q7, d24, d11
843 vshrn.u16 d6, q2, #8 /* convert DST_R data to 32-bpp (part1) */
846 vrshr.u16 q8, q0, #8 /* IN for SRC pixels (part2) */
848 vrshr.u16 q10, q6, #8
849 vrshr.u16 q11, q7, #8
850 vraddhn.u16 d0, q0, q8
851 vraddhn.u16 d1, q1, q9
852 vraddhn.u16 d2, q6, q10
853 vraddhn.u16 d3, q7, q11
854 vsri.u8 d6, d6, #5 /* convert DST_R data to 32-bpp (part2) */
857 vshrn.u16 d30, q2, #2
858 vmull.u8 q8, d3, d6 /* now do alpha blending */
860 vmull.u8 q10, d3, d30
/* Tail: finish the div-by-255, saturating add, and repack to r5g6b5. */
863 .macro pixman_composite_over_8888_8_0565_process_pixblock_tail
864 /* 3 cycle bubble (after vmull.u8) */
865 vrshr.u16 q13, q8, #8
866 vrshr.u16 q11, q9, #8
867 vrshr.u16 q15, q10, #8
868 vraddhn.u16 d16, q8, q13
869 vraddhn.u16 d27, q9, q11
870 vraddhn.u16 d26, q10, q15
871 vqadd.u8 d16, d2, d16
874 vshll.u8 q14, d16, #8 /* convert to 16bpp */
879 vsri.u16 q14, q9, #11
/* Pipelined loop body interleaving tail, head and memory accesses. */
882 .macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
883 vld1.16 {d4, d5}, [DST_R, :128]!
888 vmull.u8 q6, d24, d10
889 vrshr.u16 q13, q8, #8
890 vrshr.u16 q11, q9, #8
891 vrshr.u16 q15, q10, #8
892 vraddhn.u16 d16, q8, q13
893 vraddhn.u16 d27, q9, q11
894 vraddhn.u16 d26, q10, q15
895 vqadd.u8 d16, d2, d16
898 vshll.u8 q14, d16, #8
903 vmull.u8 q7, d24, d11
904 vsri.u16 q14, q9, #11
911 vrshr.u16 q10, q6, #8
912 vrshr.u16 q11, q7, #8
913 vraddhn.u16 d0, q0, q8
914 vraddhn.u16 d1, q1, q9
915 vraddhn.u16 d2, q6, q10
916 vraddhn.u16 d3, q7, q11
920 vshrn.u16 d30, q2, #2
921 vst1.16 {d28, d29}, [DST_W, :128]!
924 vmull.u8 q10, d3, d30
/*
 * Instantiate pixman_composite_over_8888_8_0565_asm_neon: 32bpp src,
 * 8bpp mask, 16bpp dst.  Uses *_need_all_regs init/cleanup because the
 * source channels occupy callee-saved d8-d11 (src_basereg = 8).
 */
927 generate_composite_function \
928     pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
929     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
930     8, /* number of pixels, processed in a single block */ \
931     5, /* prefetch distance */ \
932     default_init_need_all_regs, \
933     default_cleanup_need_all_regs, \
934     pixman_composite_over_8888_8_0565_process_pixblock_head, \
935     pixman_composite_over_8888_8_0565_process_pixblock_tail, \
936     pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
937     28, /* dst_w_basereg */ \
938     4,  /* dst_r_basereg */ \
939     8,  /* src_basereg   */ \
940     24  /* mask_basereg  */
942 /******************************************************************************/
945 * This function needs a special initialization of solid mask.
946 * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
947 * offset, split into color components and replicated in d8-d11
948 * registers. Additionally, this function needs all the NEON registers,
949 * so it has to save d8-d15 registers which are callee saved according
950 * to ABI. These registers are restored from 'cleanup' macro. All the
951 * other NEON registers are caller saved, so can be clobbered freely
952 * without introducing any problems.
/*
 * Init for OVER(solid color IN a8 mask -> r5g6b5): loads the solid source
 * color from the stack into d11's lane 0; the explanatory comment above
 * describes the full setup (channel split/replication to d8-d11 and
 * saving of d8-d15 — those lines appear elided in this extract).
 */
954 .macro pixman_composite_over_n_8_0565_init
955     add DUMMY, sp, #ARGS_STACK_OFFSET
957     vld1.32 {d11[0]}, [DUMMY]
/* Cleanup: restores callee-saved d8-d15 (body elided in this extract). */
964 .macro pixman_composite_over_n_8_0565_cleanup
/*
 * Instantiate pixman_composite_over_n_8_0565_asm_neon, reusing the
 * over_8888_8_0565 pixblock macros with a solid-color init/cleanup.
 */
968 generate_composite_function \
969     pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
970     FLAG_DST_READWRITE, \
971     8, /* number of pixels, processed in a single block */ \
972     5, /* prefetch distance */ \
973     pixman_composite_over_n_8_0565_init, \
974     pixman_composite_over_n_8_0565_cleanup, \
975     pixman_composite_over_8888_8_0565_process_pixblock_head, \
976     pixman_composite_over_8888_8_0565_process_pixblock_tail, \
977     pixman_composite_over_8888_8_0565_process_pixblock_tail_head
979 /******************************************************************************/
/*
 * Init for OVER(a8r8g8b8 src IN solid mask -> r5g6b5): loads the solid
 * mask value (second stack argument, hence the +8 offset) into d24's
 * lane 0; replication/setup lines appear elided in this extract.
 */
981 .macro pixman_composite_over_8888_n_0565_init
982     add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
984     vld1.32 {d24[0]}, [DUMMY]
/* Cleanup macro (body elided in this extract). */
988 .macro pixman_composite_over_8888_n_0565_cleanup
/*
 * Instantiate pixman_composite_over_8888_n_0565_asm_neon, reusing the
 * over_8888_8_0565 pixblock macros with a solid-mask init/cleanup.
 */
992 generate_composite_function \
993     pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
994     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
995     8, /* number of pixels, processed in a single block */ \
996     5, /* prefetch distance */ \
997     pixman_composite_over_8888_n_0565_init, \
998     pixman_composite_over_8888_n_0565_cleanup, \
999     pixman_composite_over_8888_8_0565_process_pixblock_head, \
1000    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
1001    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
1002    28, /* dst_w_basereg */ \
1003    4,  /* dst_r_basereg */ \
1004    8,  /* src_basereg   */ \
1005    24  /* mask_basereg  */
1007 /******************************************************************************/
/*
 * SRC(r5g6b5 -> r5g6b5): plain 16bpp copy, 16 pixels per block.  Head and
 * tail have no arithmetic to do (bodies elided/empty in this extract);
 * the loop body is just a bulk store plus cache_preload.
 */
1009 .macro pixman_composite_src_0565_0565_process_pixblock_head
1012 .macro pixman_composite_src_0565_0565_process_pixblock_tail
1015 .macro pixman_composite_src_0565_0565_process_pixblock_tail_head
1016     vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
1018     cache_preload 16, 16
/*
 * Instantiate pixman_composite_src_0565_0565_asm_neon; all base registers
 * are 0 since data passes straight through d0-d3.
 * NOTE(review): init/cleanup argument lines appear elided in this extract.
 */
1021 generate_composite_function \
1022     pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
1023     FLAG_DST_WRITEONLY, \
1024     16, /* number of pixels, processed in a single block */ \
1025     10, /* prefetch distance */ \
1028     pixman_composite_src_0565_0565_process_pixblock_head, \
1029     pixman_composite_src_0565_0565_process_pixblock_tail, \
1030     pixman_composite_src_0565_0565_process_pixblock_tail_head, \
1031     0, /* dst_w_basereg */ \
1032     0, /* dst_r_basereg */ \
1033     0, /* src_basereg   */ \
1034     0  /* mask_basereg  */
1036 /******************************************************************************/
/*
 * src_n_8: fill an a8 destination with a solid value.  Init loads the solid
 * color into d0[0] and replicates it across the register with vsli shifts
 * (the #8 replication step appears to be on an elided line -- TODO confirm);
 * the pixblock then just streams 32 bytes per iteration.
 */
1038 .macro pixman_composite_src_n_8_process_pixblock_head
1041 .macro pixman_composite_src_n_8_process_pixblock_tail
1044 .macro pixman_composite_src_n_8_process_pixblock_tail_head
1045 vst1.8 {d0, d1, d2, d3}, [DST_W, :128]!
1048 .macro pixman_composite_src_n_8_init
1049 add DUMMY, sp, #ARGS_STACK_OFFSET
1050 vld1.32 {d0[0]}, [DUMMY]
/* replicate the 16-bit pattern to all 64 bits of d0 */
1052 vsli.u64 d0, d0, #16
1053 vsli.u64 d0, d0, #32
1058 .macro pixman_composite_src_n_8_cleanup
1061 generate_composite_function \
1062 pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
1063 FLAG_DST_WRITEONLY, \
1064 32, /* number of pixels, processed in a single block */ \
1065 0, /* prefetch distance */ \
1066 pixman_composite_src_n_8_init, \
1067 pixman_composite_src_n_8_cleanup, \
1068 pixman_composite_src_n_8_process_pixblock_head, \
1069 pixman_composite_src_n_8_process_pixblock_tail, \
1070 pixman_composite_src_n_8_process_pixblock_tail_head, \
1071 0, /* dst_w_basereg */ \
1072 0, /* dst_r_basereg */ \
1073 0, /* src_basereg */ \
1074 0 /* mask_basereg */
1076 /******************************************************************************/
/*
 * src_n_0565: fill an r5g6b5 destination with a solid value.  The 16-bit
 * pattern is replicated to all of d0 in init, then 16 pixels are stored
 * per iteration (the d0->d1..d3 broadcast is on elided lines).
 */
1078 .macro pixman_composite_src_n_0565_process_pixblock_head
1081 .macro pixman_composite_src_n_0565_process_pixblock_tail
1084 .macro pixman_composite_src_n_0565_process_pixblock_tail_head
1085 vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
1088 .macro pixman_composite_src_n_0565_init
1089 add DUMMY, sp, #ARGS_STACK_OFFSET
1090 vld1.32 {d0[0]}, [DUMMY]
/* replicate the 16-bit pixel across the whole 64-bit register */
1091 vsli.u64 d0, d0, #16
1092 vsli.u64 d0, d0, #32
1097 .macro pixman_composite_src_n_0565_cleanup
1100 generate_composite_function \
1101 pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
1102 FLAG_DST_WRITEONLY, \
1103 16, /* number of pixels, processed in a single block */ \
1104 0, /* prefetch distance */ \
1105 pixman_composite_src_n_0565_init, \
1106 pixman_composite_src_n_0565_cleanup, \
1107 pixman_composite_src_n_0565_process_pixblock_head, \
1108 pixman_composite_src_n_0565_process_pixblock_tail, \
1109 pixman_composite_src_n_0565_process_pixblock_tail_head, \
1110 0, /* dst_w_basereg */ \
1111 0, /* dst_r_basereg */ \
1112 0, /* src_basereg */ \
1113 0 /* mask_basereg */
1115 /******************************************************************************/
/*
 * src_n_8888: fill an a8r8g8b8 destination with a solid value.  The 32-bit
 * color is duplicated to both halves of d0; broadcast to d1..d3 happens on
 * lines elided from this excerpt.
 */
1117 .macro pixman_composite_src_n_8888_process_pixblock_head
1120 .macro pixman_composite_src_n_8888_process_pixblock_tail
1123 .macro pixman_composite_src_n_8888_process_pixblock_tail_head
1124 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1127 .macro pixman_composite_src_n_8888_init
1128 add DUMMY, sp, #ARGS_STACK_OFFSET
1129 vld1.32 {d0[0]}, [DUMMY]
/* duplicate the 32-bit color into both lanes of d0 */
1130 vsli.u64 d0, d0, #32
1135 .macro pixman_composite_src_n_8888_cleanup
1138 generate_composite_function \
1139 pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
1140 FLAG_DST_WRITEONLY, \
1141 8, /* number of pixels, processed in a single block */ \
1142 0, /* prefetch distance */ \
1143 pixman_composite_src_n_8888_init, \
1144 pixman_composite_src_n_8888_cleanup, \
1145 pixman_composite_src_n_8888_process_pixblock_head, \
1146 pixman_composite_src_n_8888_process_pixblock_tail, \
1147 pixman_composite_src_n_8888_process_pixblock_tail_head, \
1148 0, /* dst_w_basereg */ \
1149 0, /* dst_r_basereg */ \
1150 0, /* src_basereg */ \
1151 0 /* mask_basereg */
1153 /******************************************************************************/
/*
 * src_8888_8888: straight 32bpp -> 32bpp copy, 8 pixels per iteration.
 * head/tail are empty; the source load paired with this store is on lines
 * elided from this excerpt.
 */
1155 .macro pixman_composite_src_8888_8888_process_pixblock_head
1158 .macro pixman_composite_src_8888_8888_process_pixblock_tail
1161 .macro pixman_composite_src_8888_8888_process_pixblock_tail_head
1162 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
/* NOTE(review): init/cleanup argument lines of this invocation are elided
 * from this excerpt. */
1167 generate_composite_function \
1168 pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
1169 FLAG_DST_WRITEONLY, \
1170 8, /* number of pixels, processed in a single block */ \
1171 10, /* prefetch distance */ \
1174 pixman_composite_src_8888_8888_process_pixblock_head, \
1175 pixman_composite_src_8888_8888_process_pixblock_tail, \
1176 pixman_composite_src_8888_8888_process_pixblock_tail_head, \
1177 0, /* dst_w_basereg */ \
1178 0, /* dst_r_basereg */ \
1179 0, /* src_basereg */ \
1180 0 /* mask_basereg */
1182 /******************************************************************************/
/*
 * src_x888_8888: copy x8r8g8b8 -> a8r8g8b8, forcing the alpha byte to 0xFF.
 * Init builds the 0xFF000000 mask in q2 (the vmov seeding q2 is on an
 * elided line -- TODO confirm); the per-block OR with q2 is likewise on
 * elided lines of the pixblock macros.
 */
1184 .macro pixman_composite_src_x888_8888_process_pixblock_head
1189 .macro pixman_composite_src_x888_8888_process_pixblock_tail
1192 .macro pixman_composite_src_x888_8888_process_pixblock_tail_head
1193 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1200 .macro pixman_composite_src_x888_8888_init
1202 vshl.u32 q2, q2, #24
/* NOTE(review): the cleanup argument line of this invocation is elided. */
1205 generate_composite_function \
1206 pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
1207 FLAG_DST_WRITEONLY, \
1208 8, /* number of pixels, processed in a single block */ \
1209 10, /* prefetch distance */ \
1210 pixman_composite_src_x888_8888_init, \
1212 pixman_composite_src_x888_8888_process_pixblock_head, \
1213 pixman_composite_src_x888_8888_process_pixblock_tail, \
1214 pixman_composite_src_x888_8888_process_pixblock_tail_head, \
1215 0, /* dst_w_basereg */ \
1216 0, /* dst_r_basereg */ \
1217 0, /* src_basereg */ \
1218 0 /* mask_basereg */
1220 /******************************************************************************/
/*
 * over_n_8_8888: solid source OVER a8r8g8b8 destination with an 8-bit mask.
 * The vmull/vrshr/vraddhn triplets below implement the usual pixman rounded
 * "divide by 255" approximation for an 8x8 -> 8 multiply.
 */
1222 .macro pixman_composite_over_n_8_8888_process_pixblock_head
1223 /* expecting deinterleaved source data in {d8, d9, d10, d11} */
1224 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
1225 /* and destination data in {d4, d5, d6, d7} */
1226 /* mask is in d24 (d25, d26, d27 are unused) */
/* in(src, mask): multiply each solid-source channel by the mask */
1229 vmull.u8 q6, d24, d8
1230 vmull.u8 q7, d24, d9
1231 vmull.u8 q8, d24, d10
1232 vmull.u8 q9, d24, d11
/* rounded narrowing of the 16-bit products back to 8 bits (approx /255) */
1233 vrshr.u16 q10, q6, #8
1234 vrshr.u16 q11, q7, #8
1235 vrshr.u16 q12, q8, #8
1236 vrshr.u16 q13, q9, #8
1237 vraddhn.u16 d0, q6, q10
1238 vraddhn.u16 d1, q7, q11
1239 vraddhn.u16 d2, q8, q12
1240 vraddhn.u16 d3, q9, q13
1241 vmvn.8 d25, d3 /* get inverted alpha */
1242 /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */
1243 /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
1244 /* now do alpha blending */
1245 vmull.u8 q8, d25, d4
1246 vmull.u8 q9, d25, d5
1247 vmull.u8 q10, d25, d6
1248 vmull.u8 q11, d25, d7
/* tail: finish dest * (255 - alpha), then add the masked source (saturating) */
1251 .macro pixman_composite_over_n_8_8888_process_pixblock_tail
1252 vrshr.u16 q14, q8, #8
1253 vrshr.u16 q15, q9, #8
1254 vrshr.u16 q6, q10, #8
1255 vrshr.u16 q7, q11, #8
1256 vraddhn.u16 d28, q14, q8
1257 vraddhn.u16 d29, q15, q9
1258 vraddhn.u16 d30, q6, q10
1259 vraddhn.u16 d31, q7, q11
1260 vqadd.u8 q14, q0, q14
1261 vqadd.u8 q15, q1, q15
/*
 * tail_head: tail of block N interleaved with head of block N+1, with PF
 * (prefetcher) instructions slotted between NEON ops to hide memory latency.
 */
1264 .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
1265 vrshr.u16 q14, q8, #8
1266 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1267 vrshr.u16 q15, q9, #8
1269 vrshr.u16 q6, q10, #8
1270 PF add PF_X, PF_X, #8
1271 vrshr.u16 q7, q11, #8
1272 PF tst PF_CTL, #0x0F
1273 vraddhn.u16 d28, q14, q8
1274 PF addne PF_X, PF_X, #8
1275 vraddhn.u16 d29, q15, q9
1276 PF subne PF_CTL, PF_CTL, #1
1277 vraddhn.u16 d30, q6, q10
1279 vraddhn.u16 d31, q7, q11
1280 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
1281 vmull.u8 q6, d24, d8
1282 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
1283 vmull.u8 q7, d24, d9
1284 PF subge PF_X, PF_X, ORIG_W
1285 vmull.u8 q8, d24, d10
1286 PF subges PF_CTL, PF_CTL, #0x10
1287 vmull.u8 q9, d24, d11
1288 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1289 vqadd.u8 q14, q0, q14
1290 PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
1291 vqadd.u8 q15, q1, q15
1292 vrshr.u16 q10, q6, #8
1293 vrshr.u16 q11, q7, #8
1294 vrshr.u16 q12, q8, #8
1295 vrshr.u16 q13, q9, #8
1296 vraddhn.u16 d0, q6, q10
1297 vraddhn.u16 d1, q7, q11
1298 vraddhn.u16 d2, q8, q12
1299 vraddhn.u16 d3, q9, q13
1300 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1302 vmull.u8 q8, d25, d4
1303 vmull.u8 q9, d25, d5
1304 vmull.u8 q10, d25, d6
1305 vmull.u8 q11, d25, d7
/* init: load the solid color into d11[0]; splat/deinterleave of the color
 * and the d8-d11 save (callee-saved VFP regs) are on elided lines. */
1308 .macro pixman_composite_over_n_8_8888_init
1309 add DUMMY, sp, #ARGS_STACK_OFFSET
1311 vld1.32 {d11[0]}, [DUMMY]
1318 .macro pixman_composite_over_n_8_8888_cleanup
1322 generate_composite_function \
1323 pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
1324 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1325 8, /* number of pixels, processed in a single block */ \
1326 5, /* prefetch distance */ \
1327 pixman_composite_over_n_8_8888_init, \
1328 pixman_composite_over_n_8_8888_cleanup, \
1329 pixman_composite_over_n_8_8888_process_pixblock_head, \
1330 pixman_composite_over_n_8_8888_process_pixblock_tail, \
1331 pixman_composite_over_n_8_8888_process_pixblock_tail_head
1333 /******************************************************************************/
/*
 * over_n_8_8: solid source OVER an a8 destination with an a8 mask.
 * d8 holds the (splatted) solid alpha; d24-d27 hold 32 mask bytes and
 * d4-d7 hold 32 destination bytes.
 */
1335 .macro pixman_composite_over_n_8_8_process_pixblock_head
/* src = solid * mask, with rounded /255 narrowing */
1336 vmull.u8 q0, d24, d8
1337 vmull.u8 q1, d25, d8
1338 vmull.u8 q6, d26, d8
1339 vmull.u8 q7, d27, d8
1340 vrshr.u16 q10, q0, #8
1341 vrshr.u16 q11, q1, #8
1342 vrshr.u16 q12, q6, #8
1343 vrshr.u16 q13, q7, #8
1344 vraddhn.u16 d0, q0, q10
1345 vraddhn.u16 d1, q1, q11
1346 vraddhn.u16 d2, q6, q12
1347 vraddhn.u16 d3, q7, q13
/* start dest * inverted-mask (the vmvn producing d24-d27 is on elided lines
 * -- TODO confirm) */
1350 vmull.u8 q8, d24, d4
1351 vmull.u8 q9, d25, d5
1352 vmull.u8 q10, d26, d6
1353 vmull.u8 q11, d27, d7
1356 .macro pixman_composite_over_n_8_8_process_pixblock_tail
1357 vrshr.u16 q14, q8, #8
1358 vrshr.u16 q15, q9, #8
1359 vrshr.u16 q12, q10, #8
1360 vrshr.u16 q13, q11, #8
1361 vraddhn.u16 d28, q14, q8
1362 vraddhn.u16 d29, q15, q9
1363 vraddhn.u16 d30, q12, q10
1364 vraddhn.u16 d31, q13, q11
/* result = src + dest_contribution, saturating */
1365 vqadd.u8 q14, q0, q14
1366 vqadd.u8 q15, q1, q15
1369 /* TODO: expand macros and do better instructions scheduling */
1370 .macro pixman_composite_over_n_8_8_process_pixblock_tail_head
1371 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1372 pixman_composite_over_n_8_8_process_pixblock_tail
1374 cache_preload 32, 32
1375 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1376 pixman_composite_over_n_8_8_process_pixblock_head
/* init: load the solid color into d8[0]; the alpha splat is on elided lines */
1379 .macro pixman_composite_over_n_8_8_init
1380 add DUMMY, sp, #ARGS_STACK_OFFSET
1382 vld1.32 {d8[0]}, [DUMMY]
1386 .macro pixman_composite_over_n_8_8_cleanup
1390 generate_composite_function \
1391 pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
1392 FLAG_DST_READWRITE, \
1393 32, /* number of pixels, processed in a single block */ \
1394 5, /* prefetch distance */ \
1395 pixman_composite_over_n_8_8_init, \
1396 pixman_composite_over_n_8_8_cleanup, \
1397 pixman_composite_over_n_8_8_process_pixblock_head, \
1398 pixman_composite_over_n_8_8_process_pixblock_tail, \
1399 pixman_composite_over_n_8_8_process_pixblock_tail_head
1401 /******************************************************************************/
/*
 * over_n_8888_8888_ca: solid source OVER a8r8g8b8 destination with a
 * component-alpha (per-channel) a8r8g8b8 mask.
 */
1403 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1405 * 'combine_mask_ca' replacement
1407 * input: solid src (n) in {d8, d9, d10, d11}
1408 * dest in {d4, d5, d6, d7 }
1409 * mask in {d24, d25, d26, d27}
1410 * output: updated src in {d0, d1, d2, d3 }
1411 * updated mask in {d24, d25, d26, d3 }
/* src.chan * mask.chan ... */
1413 vmull.u8 q0, d24, d8
1414 vmull.u8 q1, d25, d9
1415 vmull.u8 q6, d26, d10
1416 vmull.u8 q7, d27, d11
/* ... and mask.chan * src.alpha (d11), interleaved for scheduling */
1417 vmull.u8 q9, d11, d25
1418 vmull.u8 q12, d11, d24
1419 vmull.u8 q13, d11, d26
1420 vrshr.u16 q8, q0, #8
1421 vrshr.u16 q10, q1, #8
1422 vrshr.u16 q11, q6, #8
1423 vraddhn.u16 d0, q0, q8
1424 vraddhn.u16 d1, q1, q10
1425 vraddhn.u16 d2, q6, q11
1426 vrshr.u16 q11, q12, #8
1427 vrshr.u16 q8, q9, #8
1428 vrshr.u16 q6, q13, #8
1429 vrshr.u16 q10, q7, #8
1430 vraddhn.u16 d24, q12, q11
1431 vraddhn.u16 d25, q9, q8
1432 vraddhn.u16 d26, q13, q6
1433 vraddhn.u16 d3, q7, q10
1435 * 'combine_over_ca' replacement
1437 * output: updated dest in {d28, d29, d30, d31}
/* dest * (255 - updated mask); the vmvn inverting d24-d27 is on elided
 * lines -- TODO confirm */
1441 vmull.u8 q8, d24, d4
1442 vmull.u8 q9, d25, d5
1444 vmull.u8 q10, d26, d6
1445 vmull.u8 q11, d27, d7
1448 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
1449 /* ... continue 'combine_over_ca' replacement */
1450 vrshr.u16 q14, q8, #8
1451 vrshr.u16 q15, q9, #8
1452 vrshr.u16 q6, q10, #8
1453 vrshr.u16 q7, q11, #8
1454 vraddhn.u16 d28, q14, q8
1455 vraddhn.u16 d29, q15, q9
1456 vraddhn.u16 d30, q6, q10
1457 vraddhn.u16 d31, q7, q11
/* final saturating add of the masked source */
1458 vqadd.u8 q14, q0, q14
1459 vqadd.u8 q15, q1, q15
/* tail_head: finish previous block, reload dest, run full head, store */
1462 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1463 vrshr.u16 q14, q8, #8
1464 vrshr.u16 q15, q9, #8
1465 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1466 vrshr.u16 q6, q10, #8
1467 vrshr.u16 q7, q11, #8
1468 vraddhn.u16 d28, q14, q8
1469 vraddhn.u16 d29, q15, q9
1470 vraddhn.u16 d30, q6, q10
1471 vraddhn.u16 d31, q7, q11
1473 vqadd.u8 q14, q0, q14
1474 vqadd.u8 q15, q1, q15
1476 pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1477 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* init: load solid color into d11[0]; splat into d8-d10 on elided lines */
1480 .macro pixman_composite_over_n_8888_8888_ca_init
1481 add DUMMY, sp, #ARGS_STACK_OFFSET
1483 vld1.32 {d11[0]}, [DUMMY]
1490 .macro pixman_composite_over_n_8888_8888_ca_cleanup
1494 generate_composite_function \
1495 pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
1496 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1497 8, /* number of pixels, processed in a single block */ \
1498 5, /* prefetch distance */ \
1499 pixman_composite_over_n_8888_8888_ca_init, \
1500 pixman_composite_over_n_8888_8888_ca_cleanup, \
1501 pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
1502 pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
1503 pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1505 /******************************************************************************/
/*
 * over_n_8888_0565_ca: solid source OVER an r5g6b5 destination with a
 * component-alpha mask.  The destination is expanded from 565 to planar
 * 8-bit, blended, then repacked to 565 with vshll/vsri.
 */
1507 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
1509 * 'combine_mask_ca' replacement
1511 * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A]
1512 * mask in {d24, d25, d26} [B, G, R]
1513 * output: updated src in {d0, d1, d2 } [B, G, R]
1514 * updated mask in {d24, d25, d26} [B, G, R]
1516 vmull.u8 q0, d24, d8
1517 vmull.u8 q1, d25, d9
1518 vmull.u8 q6, d26, d10
1519 vmull.u8 q9, d11, d25
1520 vmull.u8 q12, d11, d24
1521 vmull.u8 q13, d11, d26
1522 vrshr.u16 q8, q0, #8
1523 vrshr.u16 q10, q1, #8
1524 vrshr.u16 q11, q6, #8
1525 vraddhn.u16 d0, q0, q8
1526 vraddhn.u16 d1, q1, q10
1527 vraddhn.u16 d2, q6, q11
1528 vrshr.u16 q11, q12, #8
1529 vrshr.u16 q8, q9, #8
1530 vrshr.u16 q6, q13, #8
1531 vraddhn.u16 d24, q12, q11
1532 vraddhn.u16 d25, q9, q8
1534 * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
1535 * and put data into d16 - blue, d17 - green, d18 - red
1537 vshrn.u16 d17, q2, #3
1538 vshrn.u16 d18, q2, #8
1539 vraddhn.u16 d26, q13, q6
/* widen 5/6-bit channels to 8 bits by replicating their top bits */
1541 vsri.u8 d18, d18, #5
1542 vsri.u8 d17, d17, #6
1544 * 'combine_over_ca' replacement
1546 * output: updated dest in d16 - blue, d17 - green, d18 - red
1549 vshrn.u16 d16, q2, #2
/* dest * inverted per-channel mask (inversion on elided lines -- TODO confirm) */
1551 vmull.u8 q6, d16, d24
1552 vmull.u8 q7, d17, d25
1553 vmull.u8 q11, d18, d26
1556 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
1557 /* ... continue 'combine_over_ca' replacement */
1558 vrshr.u16 q10, q6, #8
1559 vrshr.u16 q14, q7, #8
1560 vrshr.u16 q15, q11, #8
1561 vraddhn.u16 d16, q10, q6
1562 vraddhn.u16 d17, q14, q7
1563 vraddhn.u16 d18, q15, q11
/* add red's masked source channel; blue/green adds are on elided lines */
1565 vqadd.u8 d18, d2, d18
1567 * convert the results in d16, d17, d18 to r5g6b5 and store
1568 * them into {d28, d29}
/* repack planar 8-bit B/G/R back to 565: shift left then insert */
1570 vshll.u8 q14, d18, #8
1571 vshll.u8 q10, d17, #8
1572 vshll.u8 q15, d16, #8
1573 vsri.u16 q14, q10, #5
1574 vsri.u16 q14, q15, #11
/*
 * tail_head: fully hand-interleaved tail + head for the next 8 pixels;
 * note the tail here accumulates into d22 (not d18) to avoid a register
 * conflict with the overlapping head computation.
 */
1577 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
1579 vrshr.u16 q10, q6, #8
1580 vrshr.u16 q14, q7, #8
1581 vld1.16 {d4, d5}, [DST_R, :128]!
1582 vrshr.u16 q15, q11, #8
1583 vraddhn.u16 d16, q10, q6
1584 vraddhn.u16 d17, q14, q7
1585 vraddhn.u16 d22, q15, q11
1586 /* process_pixblock_head */
1588 * 'combine_mask_ca' replacement
1590 * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A]
1591 * mask in {d24, d25, d26} [B, G, R]
1592 * output: updated src in {d0, d1, d2 } [B, G, R]
1593 * updated mask in {d24, d25, d26} [B, G, R]
1595 vmull.u8 q6, d26, d10
1597 vmull.u8 q0, d24, d8
1598 vqadd.u8 d22, d2, d22
1599 vmull.u8 q1, d25, d9
1601 * convert the result in d16, d17, d22 to r5g6b5 and store
1602 * it into {d28, d29}
1604 vshll.u8 q14, d22, #8
1605 vshll.u8 q10, d17, #8
1606 vshll.u8 q15, d16, #8
1607 vmull.u8 q9, d11, d25
1608 vsri.u16 q14, q10, #5
1609 vmull.u8 q12, d11, d24
1610 vmull.u8 q13, d11, d26
1611 vsri.u16 q14, q15, #11
1613 vrshr.u16 q8, q0, #8
1614 vrshr.u16 q10, q1, #8
1615 vrshr.u16 q11, q6, #8
1616 vraddhn.u16 d0, q0, q8
1617 vraddhn.u16 d1, q1, q10
1618 vraddhn.u16 d2, q6, q11
1619 vrshr.u16 q11, q12, #8
1620 vrshr.u16 q8, q9, #8
1621 vrshr.u16 q6, q13, #8
1622 vraddhn.u16 d24, q12, q11
1623 vraddhn.u16 d25, q9, q8
1625 * convert 8 r5g6b5 pixel data from {d4, d5} to planar
1626 * 8-bit format and put data into d16 - blue, d17 - green,
1629 vshrn.u16 d17, q2, #3
1630 vshrn.u16 d18, q2, #8
1631 vraddhn.u16 d26, q13, q6
1633 vsri.u8 d17, d17, #6
1634 vsri.u8 d18, d18, #5
1636 * 'combine_over_ca' replacement
1638 * output: updated dest in d16 - blue, d17 - green, d18 - red
1641 vshrn.u16 d16, q2, #2
1643 vmull.u8 q7, d17, d25
1644 vmull.u8 q6, d16, d24
1645 vmull.u8 q11, d18, d26
1646 vst1.16 {d28, d29}, [DST_W, :128]!
/* init: load the solid color into d11[0]; splat on elided lines */
1649 .macro pixman_composite_over_n_8888_0565_ca_init
1650 add DUMMY, sp, #ARGS_STACK_OFFSET
1652 vld1.32 {d11[0]}, [DUMMY]
1659 .macro pixman_composite_over_n_8888_0565_ca_cleanup
1663 generate_composite_function \
1664 pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
1665 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1666 8, /* number of pixels, processed in a single block */ \
1667 5, /* prefetch distance */ \
1668 pixman_composite_over_n_8888_0565_ca_init, \
1669 pixman_composite_over_n_8888_0565_ca_cleanup, \
1670 pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
1671 pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
1672 pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
1674 /******************************************************************************/
/*
 * in_n_8: IN operator with a solid source and an a8 destination -- the
 * destination is multiplied by the solid alpha held (splatted) in d3.
 * The first two vmull ops of the head are on elided lines.
 */
1676 .macro pixman_composite_in_n_8_process_pixblock_head
1677 /* expecting source data in {d0, d1, d2, d3} */
1678 /* and destination data in {d4, d5, d6, d7} */
1681 vmull.u8 q10, d6, d3
1682 vmull.u8 q11, d7, d3
1685 .macro pixman_composite_in_n_8_process_pixblock_tail
/* rounded /255 narrowing of dest * alpha into d28-d31 */
1686 vrshr.u16 q14, q8, #8
1687 vrshr.u16 q15, q9, #8
1688 vrshr.u16 q12, q10, #8
1689 vrshr.u16 q13, q11, #8
1690 vraddhn.u16 d28, q8, q14
1691 vraddhn.u16 d29, q9, q15
1692 vraddhn.u16 d30, q10, q12
1693 vraddhn.u16 d31, q11, q13
1696 .macro pixman_composite_in_n_8_process_pixblock_tail_head
1697 pixman_composite_in_n_8_process_pixblock_tail
1698 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1699 cache_preload 32, 32
1700 pixman_composite_in_n_8_process_pixblock_head
1701 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* init: load solid color into d3[0]; the alpha splat is on elided lines */
1704 .macro pixman_composite_in_n_8_init
1705 add DUMMY, sp, #ARGS_STACK_OFFSET
1706 vld1.32 {d3[0]}, [DUMMY]
1710 .macro pixman_composite_in_n_8_cleanup
1713 generate_composite_function \
1714 pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
1715 FLAG_DST_READWRITE, \
1716 32, /* number of pixels, processed in a single block */ \
1717 5, /* prefetch distance */ \
1718 pixman_composite_in_n_8_init, \
1719 pixman_composite_in_n_8_cleanup, \
1720 pixman_composite_in_n_8_process_pixblock_head, \
1721 pixman_composite_in_n_8_process_pixblock_tail, \
1722 pixman_composite_in_n_8_process_pixblock_tail_head, \
1723 28, /* dst_w_basereg */ \
1724 4, /* dst_r_basereg */ \
1725 0, /* src_basereg */ \
1726 24 /* mask_basereg */
/*
 * add_n_8_8: ADD operator, solid source with an a8 mask onto an a8
 * destination: dest = saturate(dest + mask * solid_alpha).
 */
1728 .macro pixman_composite_add_n_8_8_process_pixblock_head
1729 /* expecting source data in {d8, d9, d10, d11} */
1730 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
1731 /* and destination data in {d4, d5, d6, d7} */
1732 /* mask is in d24, d25, d26, d27 */
/* mask * solid alpha, with rounded /255 narrowing */
1733 vmull.u8 q0, d24, d11
1734 vmull.u8 q1, d25, d11
1735 vmull.u8 q6, d26, d11
1736 vmull.u8 q7, d27, d11
1737 vrshr.u16 q10, q0, #8
1738 vrshr.u16 q11, q1, #8
1739 vrshr.u16 q12, q6, #8
1740 vrshr.u16 q13, q7, #8
1741 vraddhn.u16 d0, q0, q10
1742 vraddhn.u16 d1, q1, q11
1743 vraddhn.u16 d2, q6, q12
1744 vraddhn.u16 d3, q7, q13
/* saturating add onto the destination */
1745 vqadd.u8 q14, q0, q2
1746 vqadd.u8 q15, q1, q3
1749 .macro pixman_composite_add_n_8_8_process_pixblock_tail
1752 /* TODO: expand macros and do better instructions scheduling */
1753 .macro pixman_composite_add_n_8_8_process_pixblock_tail_head
1754 pixman_composite_add_n_8_8_process_pixblock_tail
1755 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1756 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1758 cache_preload 32, 32
1759 pixman_composite_add_n_8_8_process_pixblock_head
/* init: load solid color into d11[0]; alpha splat is on elided lines */
1762 .macro pixman_composite_add_n_8_8_init
1763 add DUMMY, sp, #ARGS_STACK_OFFSET
1765 vld1.32 {d11[0]}, [DUMMY]
1769 .macro pixman_composite_add_n_8_8_cleanup
1773 generate_composite_function \
1774 pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
1775 FLAG_DST_READWRITE, \
1776 32, /* number of pixels, processed in a single block */ \
1777 5, /* prefetch distance */ \
1778 pixman_composite_add_n_8_8_init, \
1779 pixman_composite_add_n_8_8_cleanup, \
1780 pixman_composite_add_n_8_8_process_pixblock_head, \
1781 pixman_composite_add_n_8_8_process_pixblock_tail, \
1782 pixman_composite_add_n_8_8_process_pixblock_tail_head
1784 /******************************************************************************/
/*
 * add_8_8_8: ADD operator, a8 source with an a8 mask onto an a8
 * destination: dest = saturate(dest + src * mask).
 */
1786 .macro pixman_composite_add_8_8_8_process_pixblock_head
1787 /* expecting source data in {d0, d1, d2, d3} */
1788 /* destination data in {d4, d5, d6, d7} */
1789 /* mask in {d24, d25, d26, d27} */
/* src * mask with rounded /255 narrowing, then saturating add to dest */
1790 vmull.u8 q8, d24, d0
1791 vmull.u8 q9, d25, d1
1792 vmull.u8 q10, d26, d2
1793 vmull.u8 q11, d27, d3
1794 vrshr.u16 q0, q8, #8
1795 vrshr.u16 q1, q9, #8
1796 vrshr.u16 q12, q10, #8
1797 vrshr.u16 q13, q11, #8
1798 vraddhn.u16 d0, q0, q8
1799 vraddhn.u16 d1, q1, q9
1800 vraddhn.u16 d2, q12, q10
1801 vraddhn.u16 d3, q13, q11
1802 vqadd.u8 q14, q0, q2
1803 vqadd.u8 q15, q1, q3
1806 .macro pixman_composite_add_8_8_8_process_pixblock_tail
1809 /* TODO: expand macros and do better instructions scheduling */
1810 .macro pixman_composite_add_8_8_8_process_pixblock_tail_head
1811 pixman_composite_add_8_8_8_process_pixblock_tail
1812 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1813 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1816 cache_preload 32, 32
1817 pixman_composite_add_8_8_8_process_pixblock_head
/* init/cleanup intentionally empty: no solid color to load */
1820 .macro pixman_composite_add_8_8_8_init
1823 .macro pixman_composite_add_8_8_8_cleanup
1826 generate_composite_function \
1827 pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
1828 FLAG_DST_READWRITE, \
1829 32, /* number of pixels, processed in a single block */ \
1830 5, /* prefetch distance */ \
1831 pixman_composite_add_8_8_8_init, \
1832 pixman_composite_add_8_8_8_cleanup, \
1833 pixman_composite_add_8_8_8_process_pixblock_head, \
1834 pixman_composite_add_8_8_8_process_pixblock_tail, \
1835 pixman_composite_add_8_8_8_process_pixblock_tail_head
1837 /******************************************************************************/
/*
 * add_8888_8888_8888: ADD operator, a8r8g8b8 source and mask onto an
 * a8r8g8b8 destination.  Only the mask alpha (d27) multiplies the source
 * channels; vrsra folds the rounded /255 correction in place.
 */
1839 .macro pixman_composite_add_8888_8888_8888_process_pixblock_head
1840 /* expecting source data in {d0, d1, d2, d3} */
1841 /* destination data in {d4, d5, d6, d7} */
1842 /* mask in {d24, d25, d26, d27} */
1843 vmull.u8 q8, d27, d0
1844 vmull.u8 q9, d27, d1
1845 vmull.u8 q10, d27, d2
1846 vmull.u8 q11, d27, d3
1847 /* 1 cycle bubble */
/* x += x >> 8 (rounded), first half of the /255 approximation */
1848 vrsra.u16 q8, q8, #8
1849 vrsra.u16 q9, q9, #8
1850 vrsra.u16 q10, q10, #8
1851 vrsra.u16 q11, q11, #8
1854 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
1855 /* 2 cycle bubble */
1856 vrshrn.u16 d28, q8, #8
1857 vrshrn.u16 d29, q9, #8
1858 vrshrn.u16 d30, q10, #8
1859 vrshrn.u16 d31, q11, #8
1860 vqadd.u8 q14, q2, q14
1861 /* 1 cycle bubble */
1862 vqadd.u8 q15, q3, q15
/* tail_head with tail/head instructions manually interleaved; the
 * cache_preload call sits on an elided line */
1865 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
1867 vrshrn.u16 d28, q8, #8
1869 vrshrn.u16 d29, q9, #8
1870 vmull.u8 q8, d27, d0
1871 vrshrn.u16 d30, q10, #8
1872 vmull.u8 q9, d27, d1
1873 vrshrn.u16 d31, q11, #8
1874 vmull.u8 q10, d27, d2
1875 vqadd.u8 q14, q2, q14
1876 vmull.u8 q11, d27, d3
1877 vqadd.u8 q15, q3, q15
1878 vrsra.u16 q8, q8, #8
1879 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1880 vrsra.u16 q9, q9, #8
1881 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1882 vrsra.u16 q10, q10, #8
1886 vrsra.u16 q11, q11, #8
/* NOTE(review): the init/cleanup argument lines of the following
 * invocations are elided from this excerpt. */
1889 generate_composite_function \
1890 pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
1891 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1892 8, /* number of pixels, processed in a single block */ \
1893 10, /* prefetch distance */ \
1896 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1897 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1898 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
/* single-scanline variant reused by the generic combiner path */
1900 generate_composite_function_single_scanline \
1901 pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
1902 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1903 8, /* number of pixels, processed in a single block */ \
1906 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1907 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1908 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
1910 /******************************************************************************/
/* add_8888_8_8888: same pixblock code with an a8 mask loaded at d27
 * (mask_basereg = 27) */
1912 generate_composite_function \
1913 pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
1914 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1915 8, /* number of pixels, processed in a single block */ \
1916 5, /* prefetch distance */ \
1919 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1920 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1921 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
1922 28, /* dst_w_basereg */ \
1923 4, /* dst_r_basereg */ \
1924 0, /* src_basereg */ \
1925 27 /* mask_basereg */
1927 /******************************************************************************/
/*
 * add_n_8_8888: solid source + a8 mask onto a8r8g8b8, reusing the
 * add_8888_8888_8888 pixblock code.  The solid color is loaded into d3[0]
 * (splat/deinterleave on elided lines); base registers are remapped so the
 * mask lands in d27.
 */
1929 .macro pixman_composite_add_n_8_8888_init
1930 add DUMMY, sp, #ARGS_STACK_OFFSET
1931 vld1.32 {d3[0]}, [DUMMY]
1938 .macro pixman_composite_add_n_8_8888_cleanup
1941 generate_composite_function \
1942 pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
1943 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1944 8, /* number of pixels, processed in a single block */ \
1945 5, /* prefetch distance */ \
1946 pixman_composite_add_n_8_8888_init, \
1947 pixman_composite_add_n_8_8888_cleanup, \
1948 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1949 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1950 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
1951 28, /* dst_w_basereg */ \
1952 4, /* dst_r_basereg */ \
1953 0, /* src_basereg */ \
1954 27 /* mask_basereg */
1956 /******************************************************************************/
/*
 * add_8888_n_8888: a8r8g8b8 source with a solid mask.  The mask is the
 * second stack argument (hence +8) and its alpha is placed directly in
 * d27[0], the register the shared pixblock code multiplies by.
 */
1958 .macro pixman_composite_add_8888_n_8888_init
1959 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
1960 vld1.32 {d27[0]}, [DUMMY]
1964 .macro pixman_composite_add_8888_n_8888_cleanup
1967 generate_composite_function \
1968 pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
1969 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1970 8, /* number of pixels, processed in a single block */ \
1971 5, /* prefetch distance */ \
1972 pixman_composite_add_8888_n_8888_init, \
1973 pixman_composite_add_8888_n_8888_cleanup, \
1974 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1975 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1976 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
1977 28, /* dst_w_basereg */ \
1978 4, /* dst_r_basereg */ \
1979 0, /* src_basereg */ \
1980 27 /* mask_basereg */
1982 /******************************************************************************/
/*
 * out_reverse_8888_n_8888: OUT_REVERSE operator -- dest is multiplied by
 * the inverted alpha of (src IN solid-mask).  Also reused as the building
 * block for the over_8888_* functions below.
 */
1984 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
1985 /* expecting source data in {d0, d1, d2, d3} */
1986 /* destination data in {d4, d5, d6, d7} */
1987 /* solid mask is in d15 */
/* src * mask_alpha with rounded /255 narrowing */
1990 vmull.u8 q8, d15, d3
1991 vmull.u8 q6, d15, d2
1992 vmull.u8 q5, d15, d1
1993 vmull.u8 q4, d15, d0
1994 vrshr.u16 q13, q8, #8
1995 vrshr.u16 q12, q6, #8
1996 vrshr.u16 q11, q5, #8
1997 vrshr.u16 q10, q4, #8
1998 vraddhn.u16 d3, q8, q13
1999 vraddhn.u16 d2, q6, q12
2000 vraddhn.u16 d1, q5, q11
2001 vraddhn.u16 d0, q4, q10
2002 vmvn.8 d24, d3 /* get inverted alpha */
2003 /* now do alpha blending */
2004 vmull.u8 q8, d24, d4
2005 vmull.u8 q9, d24, d5
2006 vmull.u8 q10, d24, d6
2007 vmull.u8 q11, d24, d7
2010 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
2011 vrshr.u16 q14, q8, #8
2012 vrshr.u16 q15, q9, #8
2013 vrshr.u16 q12, q10, #8
2014 vrshr.u16 q13, q11, #8
2015 vraddhn.u16 d28, q14, q8
2016 vraddhn.u16 d29, q15, q9
2017 vraddhn.u16 d30, q12, q10
2018 vraddhn.u16 d31, q13, q11
2021 /* TODO: expand macros and do better instructions scheduling */
/* 8888_8888_8888 variant: per-pixel mask; src/mask loads on elided lines */
2022 .macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
2023 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
2024 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
2028 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
2029 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* NOTE(review): no comma after the ..._tail_head argument below (gas also
 * accepts blank-separated macro arguments) -- confirm against the
 * generate_composite_function_single_scanline definition. */
2032 generate_composite_function_single_scanline \
2033 pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
2034 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2035 8, /* number of pixels, processed in a single block */ \
2036 default_init_need_all_regs, \
2037 default_cleanup_need_all_regs, \
2038 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
2039 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
2040 pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \
2041 28, /* dst_w_basereg */ \
2042 4, /* dst_r_basereg */ \
2043 0, /* src_basereg */ \
2044 12 /* mask_basereg */
2046 /******************************************************************************/
/*
 * over_8888_n_8888: a8r8g8b8 source OVER a8r8g8b8 destination with a solid
 * mask.  OVER = OUT_REVERSE + saturating add of the masked source, so the
 * head/tail simply wrap the out_reverse macros above.
 */
2048 .macro pixman_composite_over_8888_n_8888_process_pixblock_head
2049 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
2052 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail
2053 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
2054 vqadd.u8 q14, q0, q14
2055 vqadd.u8 q15, q1, q15
2058 /* TODO: expand macros and do better instructions scheduling */
2059 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
2060 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
2061 pixman_composite_over_8888_n_8888_process_pixblock_tail
2064 pixman_composite_over_8888_n_8888_process_pixblock_head
2065 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* init: mask alpha goes to d15[0]; NOTE(review): the DUMMY address setup
 * (add DUMMY, sp, ...) and any vpush of d8-d15 fall on lines elided from
 * this excerpt -- confirm upstream. */
2068 .macro pixman_composite_over_8888_n_8888_init
2071 vld1.32 {d15[0]}, [DUMMY]
2075 .macro pixman_composite_over_8888_n_8888_cleanup
2079 generate_composite_function \
2080 pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
2081 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2082 8, /* number of pixels, processed in a single block */ \
2083 5, /* prefetch distance */ \
2084 pixman_composite_over_8888_n_8888_init, \
2085 pixman_composite_over_8888_n_8888_cleanup, \
2086 pixman_composite_over_8888_n_8888_process_pixblock_head, \
2087 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2088 pixman_composite_over_8888_n_8888_process_pixblock_tail_head
2090 /******************************************************************************/
2092 /* TODO: expand macros and do better instructions scheduling */
/* over_8888_8888_8888: per-pixel a8r8g8b8 mask variant; the src/mask loads
 * of the next block sit on lines elided from this excerpt */
2093 .macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
2094 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
2095 pixman_composite_over_8888_n_8888_process_pixblock_tail
2099 pixman_composite_over_8888_n_8888_process_pixblock_head
2100 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* NOTE(review): no comma after the ..._tail_head argument in the two
 * invocations below (gas also accepts blank-separated macro arguments) --
 * confirm against the generator macro definitions. */
2103 generate_composite_function \
2104 pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
2105 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2106 8, /* number of pixels, processed in a single block */ \
2107 5, /* prefetch distance */ \
2108 default_init_need_all_regs, \
2109 default_cleanup_need_all_regs, \
2110 pixman_composite_over_8888_n_8888_process_pixblock_head, \
2111 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2112 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
2113 28, /* dst_w_basereg */ \
2114 4, /* dst_r_basereg */ \
2115 0, /* src_basereg */ \
2116 12 /* mask_basereg */
/* single-scanline variant reused by the generic combiner path */
2118 generate_composite_function_single_scanline \
2119 pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
2120 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2121 8, /* number of pixels, processed in a single block */ \
2122 default_init_need_all_regs, \
2123 default_cleanup_need_all_regs, \
2124 pixman_composite_over_8888_n_8888_process_pixblock_head, \
2125 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2126 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
2127 28, /* dst_w_basereg */ \
2128 4, /* dst_r_basereg */ \
2129 0, /* src_basereg */ \
2130 12 /* mask_basereg */
2132 /******************************************************************************/
2134 /* TODO: expand macros and do better instructions scheduling */
/* OVER with an a8 (8-bit) per-pixel mask: same arithmetic as the
 * over_8888_n_8888 macros, different load/store wrapper and mask
 * base register (15, i.e. the mask byte lives in d15).
 */
2135 .macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
2136 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
2137 pixman_composite_over_8888_n_8888_process_pixblock_tail
2141 pixman_composite_over_8888_n_8888_process_pixblock_head
2142 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* 32bpp src, 8bpp mask, 32bpp dst. */
2145 generate_composite_function \
2146 pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
2147 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2148 8, /* number of pixels, processed in a single block */ \
2149 5, /* prefetch distance */ \
2150 default_init_need_all_regs, \
2151 default_cleanup_need_all_regs, \
2152 pixman_composite_over_8888_n_8888_process_pixblock_head, \
2153 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2154 pixman_composite_over_8888_8_8888_process_pixblock_tail_head \
2155 28, /* dst_w_basereg */ \
2156 4, /* dst_r_basereg */ \
2157 0, /* src_basereg */ \
2158 15 /* mask_basereg */
2160 /******************************************************************************/
/* Plain 24bpp -> 24bpp copy (SRC operator).  Head and tail have no
 * per-pixel math; the tail_head just stores the loaded r8g8b8 triples.
 */
2162 .macro pixman_composite_src_0888_0888_process_pixblock_head
2165 .macro pixman_composite_src_0888_0888_process_pixblock_tail
2168 .macro pixman_composite_src_0888_0888_process_pixblock_tail_head
2169 vst3.8 {d0, d1, d2}, [DST_W]!
/* 24bpp src, no mask, 24bpp dst; write-only destination. */
2174 generate_composite_function \
2175 pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
2176 FLAG_DST_WRITEONLY, \
2177 8, /* number of pixels, processed in a single block */ \
2178 10, /* prefetch distance */ \
2181 pixman_composite_src_0888_0888_process_pixblock_head, \
2182 pixman_composite_src_0888_0888_process_pixblock_tail, \
2183 pixman_composite_src_0888_0888_process_pixblock_tail_head, \
2184 0, /* dst_w_basereg */ \
2185 0, /* dst_r_basereg */ \
2186 0, /* src_basereg */ \
2187 0 /* mask_basereg */
2189 /******************************************************************************/
/* SRC conversion from 24bpp (reversed channel order -- TODO confirm
 * exact channel swap, part of the head body is not visible here) to
 * 32bpp; the init macro presumably sets up the constant alpha lane.
 */
2191 .macro pixman_composite_src_0888_8888_rev_process_pixblock_head
2195 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
2198 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
2199 vst4.8 {d0, d1, d2, d3}, [DST_W]!
2205 .macro pixman_composite_src_0888_8888_rev_init
/* 24bpp src, no mask, 32bpp dst; write-only destination. */
2209 generate_composite_function \
2210 pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
2211 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2212 8, /* number of pixels, processed in a single block */ \
2213 10, /* prefetch distance */ \
2214 pixman_composite_src_0888_8888_rev_init, \
2216 pixman_composite_src_0888_8888_rev_process_pixblock_head, \
2217 pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
2218 pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
2219 0, /* dst_w_basereg */ \
2220 0, /* dst_r_basereg */ \
2221 0, /* src_basereg */ \
2222 0 /* mask_basereg */
2224 /******************************************************************************/
/* SRC conversion 24bpp (reversed) -> r5g6b5.  The tail packs the three
 * widened channels into 16bpp: vshll places the first channel in the
 * top bits of q14, then vsri inserts q8 at bit 5 and q9 at bit 11.
 */
2226 .macro pixman_composite_src_0888_0565_rev_process_pixblock_head
2231 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
2232 vshll.u8 q14, d0, #8
2233 vsri.u16 q14, q8, #5
2234 vsri.u16 q14, q9, #11
/* tail_head repeats the packing inline and stores 8 r5g6b5 pixels. */
2237 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
2238 vshll.u8 q14, d0, #8
2240 vsri.u16 q14, q8, #5
2241 vsri.u16 q14, q9, #11
2243 vst1.16 {d28, d29}, [DST_W, :128]!
/* 24bpp src, no mask, 16bpp dst; write-only destination. */
2247 generate_composite_function \
2248 pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
2249 FLAG_DST_WRITEONLY, \
2250 8, /* number of pixels, processed in a single block */ \
2251 10, /* prefetch distance */ \
2254 pixman_composite_src_0888_0565_rev_process_pixblock_head, \
2255 pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
2256 pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
2257 28, /* dst_w_basereg */ \
2258 0, /* dst_r_basereg */ \
2259 0, /* src_basereg */ \
2260 0 /* mask_basereg */
2262 /******************************************************************************/
/* SRC from a "pixbuf" (non-premultiplied) buffer: the head multiplies
 * color channels by alpha (only the q10 = d3*d2 multiply is visible
 * here); the tail rounds each q-accumulator back to 8 bits via
 * vrshr + vraddhn (divide by 255 approximation: (x + ((x+128)>>8)+128)>>8).
 * Results land in d30/d29/d28, i.e. with red/blue swapped relative to
 * the rpixbuf variant below.
 */
2264 .macro pixman_composite_src_pixbuf_8888_process_pixblock_head
2267 vmull.u8 q10, d3, d2
2270 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
2271 vrshr.u16 q11, q8, #8
2273 vrshr.u16 q12, q9, #8
2274 vrshr.u16 q13, q10, #8
2275 vraddhn.u16 d30, q11, q8
2276 vraddhn.u16 d29, q12, q9
2277 vraddhn.u16 d28, q13, q10
/* Pipelined tail_head with interleaved PF (prefetch) pseudo-ops that
 * advance PF_X and touch the next source cache lines.
 */
2280 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
2281 vrshr.u16 q11, q8, #8
2283 vrshr.u16 q12, q9, #8
2284 vrshr.u16 q13, q10, #8
2286 vraddhn.u16 d30, q11, q8
2287 PF add PF_X, PF_X, #8
2289 PF addne PF_X, PF_X, #8
2290 PF subne PF_CTL, PF_CTL, #1
2291 vraddhn.u16 d29, q12, q9
2292 vraddhn.u16 d28, q13, q10
2295 vmull.u8 q10, d3, d2
2296 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2298 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
2299 PF subge PF_X, PF_X, ORIG_W
2300 PF subges PF_CTL, PF_CTL, #0x10
2301 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
/* 32bpp src, no mask, 32bpp dst; write-only destination. */
2304 generate_composite_function \
2305 pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
2306 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2307 8, /* number of pixels, processed in a single block */ \
2308 10, /* prefetch distance */ \
2311 pixman_composite_src_pixbuf_8888_process_pixblock_head, \
2312 pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
2313 pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
2314 28, /* dst_w_basereg */ \
2315 0, /* dst_r_basereg */ \
2316 0, /* src_basereg */ \
2317 0 /* mask_basereg */
2319 /******************************************************************************/
/* Same premultiplying SRC as src_pixbuf_8888 above, but the rounded
 * channels are written to d28/d29/d30 in the opposite order, so the
 * red/blue channel placement differs from the pixbuf variant.
 */
2321 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
2324 vmull.u8 q10, d3, d2
2327 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
2328 vrshr.u16 q11, q8, #8
2330 vrshr.u16 q12, q9, #8
2331 vrshr.u16 q13, q10, #8
2332 vraddhn.u16 d28, q11, q8
2333 vraddhn.u16 d29, q12, q9
2334 vraddhn.u16 d30, q13, q10
/* Pipelined tail_head, identical structure to the pixbuf variant. */
2337 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
2338 vrshr.u16 q11, q8, #8
2340 vrshr.u16 q12, q9, #8
2341 vrshr.u16 q13, q10, #8
2343 vraddhn.u16 d28, q11, q8
2344 PF add PF_X, PF_X, #8
2346 PF addne PF_X, PF_X, #8
2347 PF subne PF_CTL, PF_CTL, #1
2348 vraddhn.u16 d29, q12, q9
2349 vraddhn.u16 d30, q13, q10
2352 vmull.u8 q10, d3, d2
2353 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2355 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
2356 PF subge PF_X, PF_X, ORIG_W
2357 PF subges PF_CTL, PF_CTL, #0x10
2358 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
/* 32bpp src, no mask, 32bpp dst; write-only destination. */
2361 generate_composite_function \
2362 pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
2363 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2364 8, /* number of pixels, processed in a single block */ \
2365 10, /* prefetch distance */ \
2368 pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
2369 pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
2370 pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
2371 28, /* dst_w_basereg */ \
2372 0, /* dst_r_basereg */ \
2373 0, /* src_basereg */ \
2374 0 /* mask_basereg */
2376 /******************************************************************************/
/* OVER: r5g6b5 source, a8 mask, r5g6b5 destination.  Both src and dst
 * are expanded to x888, the source is multiplied by the mask (d15),
 * blended, and the result repacked to r5g6b5.
 */
2378 .macro pixman_composite_over_0565_8_0565_process_pixblock_head
2379 /* mask is in d15 */
2380 convert_0565_to_x888 q4, d2, d1, d0
2381 convert_0565_to_x888 q5, d6, d5, d4
2382 /* source pixel data is in {d0, d1, d2, XX} */
2383 /* destination pixel data is in {d4, d5, d6, XX} */
2385 vmull.u8 q6, d15, d2
2386 vmull.u8 q5, d15, d1
2387 vmull.u8 q4, d15, d0
2390 vmull.u8 q13, d7, d6
2391 vrshr.u16 q12, q6, #8
2392 vrshr.u16 q11, q5, #8
2393 vrshr.u16 q10, q4, #8
2394 vraddhn.u16 d2, q6, q12
2395 vraddhn.u16 d1, q5, q11
2396 vraddhn.u16 d0, q4, q10
/* Tail: round the dst*(1-a) accumulators (q8/q9/q13), add the masked
 * source with saturation, and pack the 32bpp result back to r5g6b5.
 */
2399 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail
2400 vrshr.u16 q14, q8, #8
2401 vrshr.u16 q15, q9, #8
2402 vrshr.u16 q12, q13, #8
2403 vraddhn.u16 d28, q14, q8
2404 vraddhn.u16 d29, q15, q9
2405 vraddhn.u16 d30, q12, q13
2406 vqadd.u8 q0, q0, q14
2407 vqadd.u8 q1, q1, q15
2408 /* 32bpp result is in {d0, d1, d2, XX} */
2409 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2412 /* TODO: expand macros and do better instructions scheduling */
2413 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
2415 pixman_composite_over_0565_8_0565_process_pixblock_tail
2417 vld1.16 {d10, d11}, [DST_R, :128]!
2419 pixman_composite_over_0565_8_0565_process_pixblock_head
2420 vst1.16 {d28, d29}, [DST_W, :128]!
/* 16bpp src, 8bpp mask, 16bpp dst. */
2423 generate_composite_function \
2424 pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
2425 FLAG_DST_READWRITE, \
2426 8, /* number of pixels, processed in a single block */ \
2427 5, /* prefetch distance */ \
2428 default_init_need_all_regs, \
2429 default_cleanup_need_all_regs, \
2430 pixman_composite_over_0565_8_0565_process_pixblock_head, \
2431 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2432 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2433 28, /* dst_w_basereg */ \
2434 10, /* dst_r_basereg */ \
2435 8, /* src_basereg */ \
2436 15 /* mask_basereg */
2438 /******************************************************************************/
/* Solid-mask variant of over_0565_8_0565: only init/cleanup differ.
 * Init loads the constant mask from the stacked arguments into d15[0];
 * the pixel-block macros above then see it exactly like a per-pixel
 * mask that happens to be uniform.
 */
2440 .macro pixman_composite_over_0565_n_0565_init
2441 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
2443 vld1.32 {d15[0]}, [DUMMY]
2447 .macro pixman_composite_over_0565_n_0565_cleanup
/* 16bpp src, solid mask (0 bpp), 16bpp dst; reuses the
 * over_0565_8_0565 pixel-block macros.
 */
2451 generate_composite_function \
2452 pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
2453 FLAG_DST_READWRITE, \
2454 8, /* number of pixels, processed in a single block */ \
2455 5, /* prefetch distance */ \
2456 pixman_composite_over_0565_n_0565_init, \
2457 pixman_composite_over_0565_n_0565_cleanup, \
2458 pixman_composite_over_0565_8_0565_process_pixblock_head, \
2459 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2460 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2461 28, /* dst_w_basereg */ \
2462 10, /* dst_r_basereg */ \
2463 8, /* src_basereg */ \
2464 15 /* mask_basereg */
2466 /******************************************************************************/
/* ADD: r5g6b5 source, a8 mask, r5g6b5 destination.  Head expands both
 * operands to x888 and multiplies the source by the mask, rounding each
 * channel back to 8 bits (vrshr + vraddhn).  Unlike OVER there is no
 * destination attenuation step.
 */
2468 .macro pixman_composite_add_0565_8_0565_process_pixblock_head
2469 /* mask is in d15 */
2470 convert_0565_to_x888 q4, d2, d1, d0
2471 convert_0565_to_x888 q5, d6, d5, d4
2472 /* source pixel data is in {d0, d1, d2, XX} */
2473 /* destination pixel data is in {d4, d5, d6, XX} */
2474 vmull.u8 q6, d15, d2
2475 vmull.u8 q5, d15, d1
2476 vmull.u8 q4, d15, d0
2477 vrshr.u16 q12, q6, #8
2478 vrshr.u16 q11, q5, #8
2479 vrshr.u16 q10, q4, #8
2480 vraddhn.u16 d2, q6, q12
2481 vraddhn.u16 d1, q5, q11
2482 vraddhn.u16 d0, q4, q10
/* Tail: pack the 32bpp sum back to r5g6b5 (the saturating add itself
 * is in lines not visible here -- confirm against the full source).
 */
2485 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail
2488 /* 32bpp result is in {d0, d1, d2, XX} */
2489 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2492 /* TODO: expand macros and do better instructions scheduling */
2493 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
2495 pixman_composite_add_0565_8_0565_process_pixblock_tail
2497 vld1.16 {d10, d11}, [DST_R, :128]!
2499 pixman_composite_add_0565_8_0565_process_pixblock_head
2500 vst1.16 {d28, d29}, [DST_W, :128]!
/* 16bpp src, 8bpp mask, 16bpp dst. */
2503 generate_composite_function \
2504 pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
2505 FLAG_DST_READWRITE, \
2506 8, /* number of pixels, processed in a single block */ \
2507 5, /* prefetch distance */ \
2508 default_init_need_all_regs, \
2509 default_cleanup_need_all_regs, \
2510 pixman_composite_add_0565_8_0565_process_pixblock_head, \
2511 pixman_composite_add_0565_8_0565_process_pixblock_tail, \
2512 pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
2513 28, /* dst_w_basereg */ \
2514 10, /* dst_r_basereg */ \
2515 8, /* src_basereg */ \
2516 15 /* mask_basereg */
2518 /******************************************************************************/
/* OUT_REVERSE: a8 source into r5g6b5 dest, i.e. dst = dst * (1 - src).
 * Head expands dst to x888 and multiplies each channel by the inverted
 * alpha; tail rounds back to 8 bits and repacks to r5g6b5.
 */
2520 .macro pixman_composite_out_reverse_8_0565_process_pixblock_head
2521 /* mask is in d15 */
2522 convert_0565_to_x888 q5, d6, d5, d4
2523 /* destination pixel data is in {d4, d5, d6, xx} */
2524 vmvn.8 d24, d15 /* get inverted alpha */
2525 /* now do alpha blending */
2526 vmull.u8 q8, d24, d4
2527 vmull.u8 q9, d24, d5
2528 vmull.u8 q10, d24, d6
2531 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
2532 vrshr.u16 q14, q8, #8
2533 vrshr.u16 q15, q9, #8
2534 vrshr.u16 q12, q10, #8
2535 vraddhn.u16 d0, q14, q8
2536 vraddhn.u16 d1, q15, q9
2537 vraddhn.u16 d2, q12, q10
2538 /* 32bpp result is in {d0, d1, d2, XX} */
2539 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2542 /* TODO: expand macros and do better instructions scheduling */
2543 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
2545 pixman_composite_out_reverse_8_0565_process_pixblock_tail
2546 vld1.16 {d10, d11}, [DST_R, :128]!
2548 pixman_composite_out_reverse_8_0565_process_pixblock_head
2549 vst1.16 {d28, d29}, [DST_W, :128]!
/* 8bpp src, no mask, 16bpp dst; note src_basereg 15 (source alpha
 * bytes live in d15).
 */
2552 generate_composite_function \
2553 pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
2554 FLAG_DST_READWRITE, \
2555 8, /* number of pixels, processed in a single block */ \
2556 5, /* prefetch distance */ \
2557 default_init_need_all_regs, \
2558 default_cleanup_need_all_regs, \
2559 pixman_composite_out_reverse_8_0565_process_pixblock_head, \
2560 pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
2561 pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
2562 28, /* dst_w_basereg */ \
2563 10, /* dst_r_basereg */ \
2564 15, /* src_basereg */ \
2565 0 /* mask_basereg */
2567 /******************************************************************************/
/* OUT_REVERSE: a8 source into a8r8g8b8 dest, dst = dst * (1 - src).
 * Head inverts the source alpha (d0 -> d1) and multiplies all four
 * destination channels; tail rounds each product to 8 bits.
 */
2569 .macro pixman_composite_out_reverse_8_8888_process_pixblock_head
2571 /* destination pixel data is in {d4, d5, d6, d7} */
2572 vmvn.8 d1, d0 /* get inverted alpha */
2573 /* now do alpha blending */
2576 vmull.u8 q10, d1, d6
2577 vmull.u8 q11, d1, d7
2580 .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
2581 vrshr.u16 q14, q8, #8
2582 vrshr.u16 q15, q9, #8
2583 vrshr.u16 q12, q10, #8
2584 vrshr.u16 q13, q11, #8
2585 vraddhn.u16 d28, q14, q8
2586 vraddhn.u16 d29, q15, q9
2587 vraddhn.u16 d30, q12, q10
2588 vraddhn.u16 d31, q13, q11
2589 /* 32bpp result is in {d28, d29, d30, d31} */
2592 /* TODO: expand macros and do better instructions scheduling */
2593 .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
2595 pixman_composite_out_reverse_8_8888_process_pixblock_tail
2596 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
2598 pixman_composite_out_reverse_8_8888_process_pixblock_head
2599 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* 8bpp src, no mask, 32bpp dst. */
2602 generate_composite_function \
2603 pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
2604 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2605 8, /* number of pixels, processed in a single block */ \
2606 5, /* prefetch distance */ \
2609 pixman_composite_out_reverse_8_8888_process_pixblock_head, \
2610 pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
2611 pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
2612 28, /* dst_w_basereg */ \
2613 4, /* dst_r_basereg */ \
2614 0, /* src_basereg */ \
2615 0 /* mask_basereg */
2617 /******************************************************************************/
/* Nearest-neighbour scaled scanline entry points.  Each instantiation
 * reuses the pixel-block macros of the corresponding unscaled composite
 * operation defined earlier in this file.
 */
2619 generate_composite_function_nearest_scanline \
2620 pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
2621 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2622 8, /* number of pixels, processed in a single block */ \
2625 pixman_composite_over_8888_8888_process_pixblock_head, \
2626 pixman_composite_over_8888_8888_process_pixblock_tail, \
2627 pixman_composite_over_8888_8888_process_pixblock_tail_head
/* Scaled OVER into r5g6b5. */
2629 generate_composite_function_nearest_scanline \
2630 pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
2631 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2632 8, /* number of pixels, processed in a single block */ \
2635 pixman_composite_over_8888_0565_process_pixblock_head, \
2636 pixman_composite_over_8888_0565_process_pixblock_tail, \
2637 pixman_composite_over_8888_0565_process_pixblock_tail_head, \
2638 28, /* dst_w_basereg */ \
2639 4, /* dst_r_basereg */ \
2640 0, /* src_basereg */ \
2641 24 /* mask_basereg */
/* Scaled SRC (format conversion only) a8r8g8b8 -> r5g6b5. */
2643 generate_composite_function_nearest_scanline \
2644 pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
2645 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2646 8, /* number of pixels, processed in a single block */ \
2649 pixman_composite_src_8888_0565_process_pixblock_head, \
2650 pixman_composite_src_8888_0565_process_pixblock_tail, \
2651 pixman_composite_src_8888_0565_process_pixblock_tail_head
/* Scaled SRC r5g6b5 -> a8r8g8b8. */
2653 generate_composite_function_nearest_scanline \
2654 pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
2655 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2656 8, /* number of pixels, processed in a single block */ \
2659 pixman_composite_src_0565_8888_process_pixblock_head, \
2660 pixman_composite_src_0565_8888_process_pixblock_tail, \
2661 pixman_composite_src_0565_8888_process_pixblock_tail_head
/* Scaled OVER with an a8 mask into r5g6b5. */
2663 generate_composite_function_nearest_scanline \
2664 pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
2665 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2666 8, /* number of pixels, processed in a single block */ \
2667 default_init_need_all_regs, \
2668 default_cleanup_need_all_regs, \
2669 pixman_composite_over_8888_8_0565_process_pixblock_head, \
2670 pixman_composite_over_8888_8_0565_process_pixblock_tail, \
2671 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
2672 28, /* dst_w_basereg */ \
2673 4, /* dst_r_basereg */ \
2674 8, /* src_basereg */ \
2675 24 /* mask_basereg */
/* Scaled OVER, r5g6b5 source with a8 mask into r5g6b5. */
2677 generate_composite_function_nearest_scanline \
2678 pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
2679 FLAG_DST_READWRITE, \
2680 8, /* number of pixels, processed in a single block */ \
2681 default_init_need_all_regs, \
2682 default_cleanup_need_all_regs, \
2683 pixman_composite_over_0565_8_0565_process_pixblock_head, \
2684 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2685 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2686 28, /* dst_w_basereg */ \
2687 10, /* dst_r_basereg */ \
2688 8, /* src_basereg */ \
2689 15 /* mask_basereg */
2691 /******************************************************************************/
2693 /* Supplementary macro for setting function attributes */
/* Declare an exported function symbol with ELF %function type
 * (remaining boilerplate of this macro is defined nearby).
 */
2694 .macro pixman_asm_function fname
2699 .type fname, %function
2705 * Bilinear scaling support code which tries to provide pixel fetching, color
2706 * format conversion, and interpolation as separate macros which can be used
2707 * as the basic building blocks for constructing bilinear scanline functions.
/* Fetch the two horizontally adjacent source pixels for one output
 * pixel.  X is 16.16 fixed point; the integer part (asr #16) indexes
 * the scanline, STRIDE advances from the TOP to the BOTTOM row.
 */
2710 .macro bilinear_load_8888 reg1, reg2, tmp
2711 mov TMP1, X, asr #16
2713 add TMP1, TOP, TMP1, asl #2
2714 vld1.32 {reg1}, [TMP1], STRIDE
2715 vld1.32 {reg2}, [TMP1]
/* Same for r5g6b5 input: load two 16bpp pixel pairs into the lanes of
 * reg2 and widen them to x888 for the interpolation arithmetic.
 */
2718 .macro bilinear_load_0565 reg1, reg2, tmp
2719 mov TMP1, X, asr #16
2721 add TMP1, TOP, TMP1, asl #1
2722 vld1.32 {reg2[0]}, [TMP1], STRIDE
2723 vld1.32 {reg2[1]}, [TMP1]
2724 convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
/* Load two output pixels' worth of a8r8g8b8 source and do the vertical
 * interpolation: acc = top * d28 + bottom * d29, where d28/d29 hold
 * the vertical weights.
 */
2727 .macro bilinear_load_and_vertical_interpolate_two_8888 \
2728 acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
2730 bilinear_load_8888 reg1, reg2, tmp1
2731 vmull.u8 acc1, reg1, d28
2732 vmlal.u8 acc1, reg2, d29
2733 bilinear_load_8888 reg3, reg4, tmp2
2734 vmull.u8 acc2, reg3, d28
2735 vmlal.u8 acc2, reg4, d29
/* Four-pixel variant: simply two invocations of the two-pixel macro. */
2738 .macro bilinear_load_and_vertical_interpolate_four_8888 \
2739 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
2740 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
2742 bilinear_load_and_vertical_interpolate_two_8888 \
2743 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
2744 bilinear_load_and_vertical_interpolate_two_8888 \
2745 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
/* r5g6b5 flavour: load top/bottom 16bpp pixel pairs for two output
 * pixels into the lanes of acc2lo/acc2hi, widen them to x888, then
 * vertically interpolate with the d28/d29 weights.
 */
2748 .macro bilinear_load_and_vertical_interpolate_two_0565 \
2749 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
2751 mov TMP1, X, asr #16
2753 add TMP1, TOP, TMP1, asl #1
2754 mov TMP2, X, asr #16
2756 add TMP2, TOP, TMP2, asl #1
2757 vld1.32 {acc2lo[0]}, [TMP1], STRIDE
2758 vld1.32 {acc2hi[0]}, [TMP2], STRIDE
2759 vld1.32 {acc2lo[1]}, [TMP1]
2760 vld1.32 {acc2hi[1]}, [TMP2]
2761 convert_0565_to_x888 acc2, reg3, reg2, reg1
2766 vmull.u8 acc1, reg1, d28
2767 vmlal.u8 acc1, reg2, d29
2768 vmull.u8 acc2, reg3, d28
2769 vmlal.u8 acc2, reg4, d29
/* Four-pixel r5g6b5 variant, hand-scheduled: loads for the second pair
 * (y*) are interleaved with the vzip shuffles and multiplies of the
 * first pair (x*) to hide load latency.
 */
2772 .macro bilinear_load_and_vertical_interpolate_four_0565 \
2773 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
2774 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
2776 mov TMP1, X, asr #16
2778 add TMP1, TOP, TMP1, asl #1
2779 mov TMP2, X, asr #16
2781 add TMP2, TOP, TMP2, asl #1
2782 vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
2783 vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
2784 vld1.32 {xacc2lo[1]}, [TMP1]
2785 vld1.32 {xacc2hi[1]}, [TMP2]
2786 convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
2787 mov TMP1, X, asr #16
2789 add TMP1, TOP, TMP1, asl #1
2790 mov TMP2, X, asr #16
2792 add TMP2, TOP, TMP2, asl #1
2793 vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
2794 vzip.u8 xreg1, xreg3
2795 vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
2796 vzip.u8 xreg2, xreg4
2797 vld1.32 {yacc2lo[1]}, [TMP1]
2798 vzip.u8 xreg3, xreg4
2799 vld1.32 {yacc2hi[1]}, [TMP2]
2800 vzip.u8 xreg1, xreg2
2801 convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
2802 vmull.u8 xacc1, xreg1, d28
2803 vzip.u8 yreg1, yreg3
2804 vmlal.u8 xacc1, xreg2, d29
2805 vzip.u8 yreg2, yreg4
2806 vmull.u8 xacc2, xreg3, d28
2807 vzip.u8 yreg3, yreg4
2808 vmlal.u8 xacc2, xreg4, d29
2809 vzip.u8 yreg1, yreg2
2810 vmull.u8 yacc1, yreg1, d28
2811 vmlal.u8 yacc1, yreg2, d29
2812 vmull.u8 yacc2, yreg3, d28
2813 vmlal.u8 yacc2, yreg4, d29
/* Store numpix (4/2/1) interpolated a8r8g8b8 pixels from d0/d1;
 * any other count is a programming error (.error).
 */
2816 .macro bilinear_store_8888 numpix, tmp1, tmp2
2818 vst1.32 {d0, d1}, [OUT, :128]!
2820 vst1.32 {d0}, [OUT, :64]!
2822 vst1.32 {d0[0]}, [OUT, :32]!
2824 .error bilinear_store_8888 numpix is unsupported
/* r5g6b5 flavour: repack the 32bpp result to 16bpp first, then store
 * numpix (4/2/1) pixels from d2.
 */
2828 .macro bilinear_store_0565 numpix, tmp1, tmp2
2833 convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
2835 vst1.16 {d2}, [OUT, :64]!
2837 vst1.32 {d2[0]}, [OUT, :32]!
2839 vst1.16 {d2[0]}, [OUT, :16]!
2841 .error bilinear_store_0565 numpix is unsupported
/* Interpolate and store a single trailing pixel.  Vertical pass via
 * vmull/vmlal with weights d28/d29, horizontal pass via
 * vshll/vmlsl/vmlal with weight d30, narrowed by vshrn #16.
 * The bubble comments document NEON result latency on the original
 * target core.
 */
2845 .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
2846 bilinear_load_&src_fmt d0, d1, d2
2847 vmull.u8 q1, d0, d28
2848 vmlal.u8 q1, d1, d29
2849 /* 5 cycles bubble */
2850 vshll.u16 q0, d2, #8
2851 vmlsl.u16 q0, d2, d30
2852 vmlal.u16 q0, d3, d30
2853 /* 5 cycles bubble */
2854 vshrn.u32 d0, q0, #16
2855 /* 3 cycles bubble */
2857 /* 1 cycle bubble */
2858 bilinear_store_&dst_fmt 1, q2, q3
/* Interpolate and store two pixels.  q12 accumulates the horizontal
 * weight (updated by vadd with the step in q13); q15 receives the
 * per-pixel weights (vshr #8) consumed as d30/d31 above.
 */
2861 .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
2862 bilinear_load_and_vertical_interpolate_two_&src_fmt \
2863 q1, q11, d0, d1, d20, d21, d22, d23
2864 vshll.u16 q0, d2, #8
2865 vmlsl.u16 q0, d2, d30
2866 vmlal.u16 q0, d3, d30
2867 vshll.u16 q10, d22, #8
2868 vmlsl.u16 q10, d22, d31
2869 vmlal.u16 q10, d23, d31
2870 vshrn.u32 d0, q0, #16
2871 vshrn.u32 d1, q10, #16
2872 vshr.u16 q15, q12, #8
2873 vadd.u16 q12, q12, q13
2875 bilinear_store_&dst_fmt 2, q2, q3
/* Generic (unscheduled) four-pixel interpolation: two pixel pairs are
 * vertically interpolated into q1/q11 and q3/q9, horizontally blended
 * with weights d30/d31, narrowed, and stored.  The weight accumulator
 * q12 is advanced twice (once per pair).
 */
2878 .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
2879 bilinear_load_and_vertical_interpolate_four_&src_fmt \
2880 q1, q11, d0, d1, d20, d21, d22, d23 \
2881 q3, q9, d4, d5, d16, d17, d18, d19
2883 sub TMP1, TMP1, STRIDE
2884 vshll.u16 q0, d2, #8
2885 vmlsl.u16 q0, d2, d30
2886 vmlal.u16 q0, d3, d30
2887 vshll.u16 q10, d22, #8
2888 vmlsl.u16 q10, d22, d31
2889 vmlal.u16 q10, d23, d31
2890 vshr.u16 q15, q12, #8
2891 vshll.u16 q2, d6, #8
2892 vmlsl.u16 q2, d6, d30
2893 vmlal.u16 q2, d7, d30
2894 vshll.u16 q8, d18, #8
2896 vmlsl.u16 q8, d18, d31
2897 vmlal.u16 q8, d19, d31
2898 vadd.u16 q12, q12, q13
2899 vshrn.u32 d0, q0, #16
2900 vshrn.u32 d1, q10, #16
2901 vshrn.u32 d4, q2, #16
2902 vshrn.u32 d5, q8, #16
2903 vshr.u16 q15, q12, #8
2906 vadd.u16 q12, q12, q13
2907 bilinear_store_&dst_fmt 4, q2, q3
/* Dispatch wrappers: if a hand-optimized head/tail/tail_head exists
 * for this src/dst format pair (advertised with a
 * have_bilinear_interpolate_* .set symbol), use it; otherwise fall
 * back to the generic four-pixel macro above.
 */
2910 .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
2911 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
2912 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
2914 bilinear_interpolate_four_pixels src_fmt, dst_fmt
2918 .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
2919 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
2920 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
2924 .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
2925 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
2926 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
2928 bilinear_interpolate_four_pixels src_fmt, dst_fmt
/* Eight-pixel dispatch: generic fallback is two four-pixel steps. */
2932 .macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
2933 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
2934 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
2936 bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
2937 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
2941 .macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
2942 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
2943 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
2945 bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
2949 .macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
2950 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
2951 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
2953 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
2954 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
/* Flag bits accepted by generate_bilinear_scanline_func below. */
2958 .set BILINEAR_FLAG_UNROLL_4, 0
2959 .set BILINEAR_FLAG_UNROLL_8, 1
2960 .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
2963 * Main template macro for generating NEON optimized bilinear scanline
2966 * Bilinear scanline scaler macro template uses the following arguments:
2967 * fname - name of the function to generate
2968 * src_fmt - source color format (8888 or 0565)
2969 * dst_fmt - destination color format (8888 or 0565)
2970 * bpp_shift - (1 << bpp_shift) is the size of source pixel in bytes
2971 * prefetch_distance - prefetch in the source image by that many
/* Template that emits a complete bilinear scanline scaler.  Overall
 * shape: prologue (callee-saved push, argument fetch via ip, prefetch
 * setup), destination alignment fixup by processing 1/2/4 leading
 * pixels, a main loop unrolled by 8 or 4 pixels depending on
 * BILINEAR_FLAG_UNROLL_8, trailing 2+1 pixels, then epilogue.
 * Branch labels and some setup are handled by surrounding lines.
 */
2975 .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
2976 src_bpp_shift, dst_bpp_shift, \
2977 prefetch_distance, flags
2979 pixman_asm_function fname
2996 push {r4, r5, r6, r7, r8, r9}
2997 mov PF_OFFS, #prefetch_distance
2998 ldmia ip, {WB, X, UX, WIDTH}
2999 mul PF_OFFS, PF_OFFS, UX
3001 .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
3005 sub STRIDE, BOTTOM, TOP
3015 vadd.u16 d25, d25, d26
3017 /* ensure good destination alignment */
3020 tst OUT, #(1 << dst_bpp_shift)
/* One leading pixel if OUT is misaligned by a single pixel. */
3022 vshr.u16 q15, q12, #8
3023 vadd.u16 q12, q12, q13
3024 bilinear_interpolate_last_pixel src_fmt, dst_fmt
3025 sub WIDTH, WIDTH, #1
3027 vadd.u16 q13, q13, q13
3028 vshr.u16 q15, q12, #8
3029 vadd.u16 q12, q12, q13
/* Two leading pixels if still misaligned by a pixel pair. */
3033 tst OUT, #(1 << (dst_bpp_shift + 1))
3035 bilinear_interpolate_two_pixels src_fmt, dst_fmt
3036 sub WIDTH, WIDTH, #2
3038 .if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
3039 /*********** 8 pixels per iteration *****************/
3042 tst OUT, #(1 << (dst_bpp_shift + 2))
3044 bilinear_interpolate_four_pixels src_fmt, dst_fmt
3045 sub WIDTH, WIDTH, #4
3047 subs WIDTH, WIDTH, #8
3049 mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
3050 bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
3051 subs WIDTH, WIDTH, #8
3054 bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
3055 subs WIDTH, WIDTH, #8
3058 bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
3062 bilinear_interpolate_four_pixels src_fmt, dst_fmt
3065 /*********** 4 pixels per iteration *****************/
3066 subs WIDTH, WIDTH, #4
3068 mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
3069 bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
3070 subs WIDTH, WIDTH, #4
3073 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
3074 subs WIDTH, WIDTH, #4
3077 bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
3079 /****************************************************/
3081 /* handle the remaining trailing pixels */
3084 bilinear_interpolate_two_pixels src_fmt, dst_fmt
3088 bilinear_interpolate_last_pixel src_fmt, dst_fmt
3090 .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
3093 pop {r4, r5, r6, r7, r8, r9}
3113 /*****************************************************************************/
/* Advertise the hand-scheduled 8888->8888 four-pixel implementation to
 * the dispatch macros above.
 */
3115 .set have_bilinear_interpolate_four_pixels_8888_8888, 1
/* Head: start four pixels -- compute the four source addresses
 * (TMP1..TMP4) from X, issue the loads, and begin vertical (q8..q11)
 * and horizontal (q0, q1) interpolation, interleaved to hide latency.
 */
3117 .macro bilinear_interpolate_four_pixels_8888_8888_head
3118 mov TMP1, X, asr #16
3120 add TMP1, TOP, TMP1, asl #2
3121 mov TMP2, X, asr #16
3123 add TMP2, TOP, TMP2, asl #2
3125 vld1.32 {d22}, [TMP1], STRIDE
3126 vld1.32 {d23}, [TMP1]
3127 mov TMP3, X, asr #16
3129 add TMP3, TOP, TMP3, asl #2
3130 vmull.u8 q8, d22, d28
3131 vmlal.u8 q8, d23, d29
3133 vld1.32 {d22}, [TMP2], STRIDE
3134 vld1.32 {d23}, [TMP2]
3135 mov TMP4, X, asr #16
3137 add TMP4, TOP, TMP4, asl #2
3138 vmull.u8 q9, d22, d28
3139 vmlal.u8 q9, d23, d29
3141 vld1.32 {d22}, [TMP3], STRIDE
3142 vld1.32 {d23}, [TMP3]
3143 vmull.u8 q10, d22, d28
3144 vmlal.u8 q10, d23, d29
3146 vshll.u16 q0, d16, #8
3147 vmlsl.u16 q0, d16, d30
3148 vmlal.u16 q0, d17, d30
3151 vld1.32 {d16}, [TMP4], STRIDE
3152 vld1.32 {d17}, [TMP4]
3154 vmull.u8 q11, d16, d28
3155 vmlal.u8 q11, d17, d29
3157 vshll.u16 q1, d18, #8
3158 vmlsl.u16 q1, d18, d31
/* Tail: finish the horizontal interpolation of all four pixels,
 * advance the weight accumulator q12 twice, and store four pixels.
 */
3161 .macro bilinear_interpolate_four_pixels_8888_8888_tail
3162 vmlal.u16 q1, d19, d31
3163 vshr.u16 q15, q12, #8
3164 vshll.u16 q2, d20, #8
3165 vmlsl.u16 q2, d20, d30
3166 vmlal.u16 q2, d21, d30
3167 vshll.u16 q3, d22, #8
3168 vmlsl.u16 q3, d22, d31
3169 vmlal.u16 q3, d23, d31
3170 vadd.u16 q12, q12, q13
3171 vshrn.u32 d0, q0, #16
3172 vshrn.u32 d1, q1, #16
3173 vshrn.u32 d4, q2, #16
3174 vshr.u16 q15, q12, #8
3175 vshrn.u32 d5, q3, #16
3178 vadd.u16 q12, q12, q13
3179 vst1.32 {d6, d7}, [OUT, :128]!
/* Software-pipelined tail_head: finishes the previous four pixels
 * while simultaneously fetching and starting the next four.  The
 * instruction order is load-latency driven -- do not reorder.
 */
3182 .macro bilinear_interpolate_four_pixels_8888_8888_tail_head
3183 mov TMP1, X, asr #16
3185 add TMP1, TOP, TMP1, asl #2
3186 mov TMP2, X, asr #16
3188 add TMP2, TOP, TMP2, asl #2
3189 vmlal.u16 q1, d19, d31
3190 vshr.u16 q15, q12, #8
3191 vshll.u16 q2, d20, #8
3192 vmlsl.u16 q2, d20, d30
3193 vmlal.u16 q2, d21, d30
3194 vshll.u16 q3, d22, #8
3195 vld1.32 {d20}, [TMP1], STRIDE
3196 vmlsl.u16 q3, d22, d31
3197 vmlal.u16 q3, d23, d31
3198 vld1.32 {d21}, [TMP1]
3199 vmull.u8 q8, d20, d28
3200 vmlal.u8 q8, d21, d29
3201 vshrn.u32 d0, q0, #16
3202 vshrn.u32 d1, q1, #16
3203 vshrn.u32 d4, q2, #16
3204 vld1.32 {d22}, [TMP2], STRIDE
3205 vshrn.u32 d5, q3, #16
3206 vadd.u16 q12, q12, q13
3207 vld1.32 {d23}, [TMP2]
3208 vmull.u8 q9, d22, d28
3209 mov TMP3, X, asr #16
3211 add TMP3, TOP, TMP3, asl #2
3212 mov TMP4, X, asr #16
3214 add TMP4, TOP, TMP4, asl #2
3215 vmlal.u8 q9, d23, d29
3216 vld1.32 {d22}, [TMP3], STRIDE
3217 vshr.u16 q15, q12, #8
3218 vld1.32 {d23}, [TMP3]
3219 vmull.u8 q10, d22, d28
3220 vmlal.u8 q10, d23, d29
3222 vshll.u16 q0, d16, #8
3224 vmlsl.u16 q0, d16, d30
3225 vmlal.u16 q0, d17, d30
3227 vld1.32 {d16}, [TMP4], STRIDE
3228 vadd.u16 q12, q12, q13
3229 vld1.32 {d17}, [TMP4]
3231 vmull.u8 q11, d16, d28
3232 vmlal.u8 q11, d17, d29
3233 vst1.32 {d6, d7}, [OUT, :128]!
3234 vshll.u16 q1, d18, #8
3235 vmlsl.u16 q1, d18, d31
3238 /*****************************************************************************/
/*
 * Feature flag announcing that a hand-written eight-pixel
 * 8888 -> 0565 bilinear fast path is provided below; presumably
 * consumed by the generate_bilinear_scanline_func template in
 * pixman-arm-neon-asm.h to suppress its generic fallback — confirm
 * against that header.
 */
3240 .set have_bilinear_interpolate_eight_pixels_8888_0565, 1
/*
 * Head of the a8r8g8b8 -> r5g6b5 eight-pixel bilinear pipeline.
 * Processes eight output pixels per iteration as two back-to-back
 * four-pixel stages: each stage loads a vertically adjacent source
 * pixel pair per output pixel (two vld1 through TMPn, the first with a
 * STRIDE post-increment), blends them with the vertical weights in
 * d28/d29, then starts the horizontal blend with weights d30/d31
 * (q15), refreshed from the 16.16 x-weight accumulator q12
 * (incremented by q13).  Deliberately software-pipelined; do not
 * reorder instructions.
 */
3242 .macro bilinear_interpolate_eight_pixels_8888_0565_head
/* --- first four pixels: address generation and vertical blend --- */
3243 mov TMP1, X, asr #16
3245 add TMP1, TOP, TMP1, asl #2
3246 mov TMP2, X, asr #16
3248 add TMP2, TOP, TMP2, asl #2
3249 vld1.32 {d20}, [TMP1], STRIDE
3250 vld1.32 {d21}, [TMP1]
3251 vmull.u8 q8, d20, d28
3252 vmlal.u8 q8, d21, d29
3253 vld1.32 {d22}, [TMP2], STRIDE
3254 vld1.32 {d23}, [TMP2]
3255 vmull.u8 q9, d22, d28
3256 mov TMP3, X, asr #16
3258 add TMP3, TOP, TMP3, asl #2
3259 mov TMP4, X, asr #16
3261 add TMP4, TOP, TMP4, asl #2
3262 vmlal.u8 q9, d23, d29
3263 vld1.32 {d22}, [TMP3], STRIDE
3264 vld1.32 {d23}, [TMP3]
3265 vmull.u8 q10, d22, d28
3266 vmlal.u8 q10, d23, d29
3267 vshll.u16 q0, d16, #8
3268 vmlsl.u16 q0, d16, d30
3269 vmlal.u16 q0, d17, d30
3271 vld1.32 {d16}, [TMP4], STRIDE
3272 vld1.32 {d17}, [TMP4]
3274 vmull.u8 q11, d16, d28
3275 vmlal.u8 q11, d17, d29
3276 vshll.u16 q1, d18, #8
3277 vmlsl.u16 q1, d18, d31
/* --- second four pixels, overlapped with finishing the first --- */
3279 mov TMP1, X, asr #16
3281 add TMP1, TOP, TMP1, asl #2
3282 mov TMP2, X, asr #16
3284 add TMP2, TOP, TMP2, asl #2
3285 vmlal.u16 q1, d19, d31
3286 vshr.u16 q15, q12, #8
3287 vshll.u16 q2, d20, #8
3288 vmlsl.u16 q2, d20, d30
3289 vmlal.u16 q2, d21, d30
3290 vshll.u16 q3, d22, #8
3291 vld1.32 {d20}, [TMP1], STRIDE
3292 vmlsl.u16 q3, d22, d31
3293 vmlal.u16 q3, d23, d31
3294 vld1.32 {d21}, [TMP1]
3295 vmull.u8 q8, d20, d28
3296 vmlal.u8 q8, d21, d29
3297 vshrn.u32 d0, q0, #16
3298 vshrn.u32 d1, q1, #16
3299 vshrn.u32 d4, q2, #16
3300 vld1.32 {d22}, [TMP2], STRIDE
3301 vshrn.u32 d5, q3, #16
3302 vadd.u16 q12, q12, q13
3303 vld1.32 {d23}, [TMP2]
3304 vmull.u8 q9, d22, d28
3305 mov TMP3, X, asr #16
3307 add TMP3, TOP, TMP3, asl #2
3308 mov TMP4, X, asr #16
3310 add TMP4, TOP, TMP4, asl #2
3311 vmlal.u8 q9, d23, d29
3312 vld1.32 {d22}, [TMP3], STRIDE
3313 vshr.u16 q15, q12, #8
3314 vld1.32 {d23}, [TMP3]
3315 vmull.u8 q10, d22, d28
3316 vmlal.u8 q10, d23, d29
3318 vshll.u16 q0, d16, #8
3320 vmlsl.u16 q0, d16, d30
3321 vmlal.u16 q0, d17, d30
3323 vld1.32 {d16}, [TMP4], STRIDE
3324 vadd.u16 q12, q12, q13
3325 vld1.32 {d17}, [TMP4]
3327 vmull.u8 q11, d16, d28
3328 vmlal.u8 q11, d17, d29
3329 vshll.u16 q1, d18, #8
3330 vmlsl.u16 q1, d18, d31
/*
 * Tail of the a8r8g8b8 -> r5g6b5 eight-pixel bilinear pipeline.
 * Finishes the horizontal interpolation left in flight by the *_head
 * macro, narrows the accumulators, packs the result into r5g6b5 (the
 * vsri #11 inserts a 5-bit field at the red position) and stores eight
 * 16-bit pixels.  Pipelined; do not reorder instructions.
 */
3333 .macro bilinear_interpolate_eight_pixels_8888_0565_tail
3334 vmlal.u16 q1, d19, d31
3335 vshr.u16 q15, q12, #8
3336 vshll.u16 q2, d20, #8
3337 vmlsl.u16 q2, d20, d30
3338 vmlal.u16 q2, d21, d30
3339 vshll.u16 q3, d22, #8
3340 vmlsl.u16 q3, d22, d31
3341 vmlal.u16 q3, d23, d31
/* advance the 16.16 x-weight accumulator for the next iteration */
3342 vadd.u16 q12, q12, q13
/* drop the 16 fractional bits of each 32-bit channel accumulator */
3343 vshrn.u32 d0, q0, #16
3344 vshrn.u32 d1, q1, #16
3345 vshrn.u32 d4, q2, #16
3346 vshr.u16 q15, q12, #8
3347 vshrn.u32 d5, q3, #16
3348 vmovn.u16 d8, q0
3349 vmovn.u16 d9, q2
3350 vadd.u16 q12, q12, q13
/* pack to r5g6b5: shift channel bytes up, then insert fields */
3357 vshll.u8 q5, d10, #8
3360 vsri.u16 q5, q7, #11
/* store eight finished r5g6b5 pixels */
3361 vst1.32 {d10, d11}, [OUT, :128]!
/*
 * Combined tail+head of the a8r8g8b8 -> r5g6b5 eight-pixel bilinear
 * pipeline: packs and stores the previous iteration's eight pixels
 * while already loading and interpolating the next eight, keeping the
 * NEON multiply and load units busy every cycle.  Same register roles
 * as the head/tail macros above: d28/d29 vertical weights, d30/d31
 * (q15) horizontal weights refreshed from the q12 accumulator
 * (stepped by q13), X a 16.16 x coordinate, TOP/STRIDE the source
 * scanline base/pitch.  Scheduling is latency-critical — do not
 * reorder instructions.
 */
3364 .macro bilinear_interpolate_eight_pixels_8888_0565_tail_head
/* --- first four pixels of the next group / tail of the previous --- */
3365 mov TMP1, X, asr #16
3367 add TMP1, TOP, TMP1, asl #2
3368 mov TMP2, X, asr #16
3370 add TMP2, TOP, TMP2, asl #2
3371 vmlal.u16 q1, d19, d31
3372 vshr.u16 q15, q12, #8
3374 vshll.u16 q2, d20, #8
3375 vmlsl.u16 q2, d20, d30
3376 vmlal.u16 q2, d21, d30
3377 vshll.u16 q3, d22, #8
3378 vld1.32 {d20}, [TMP1], STRIDE
3379 vmlsl.u16 q3, d22, d31
3380 vmlal.u16 q3, d23, d31
3381 vld1.32 {d21}, [TMP1]
3382 vmull.u8 q8, d20, d28
3383 vmlal.u8 q8, d21, d29
3384 vshrn.u32 d0, q0, #16
3385 vshrn.u32 d1, q1, #16
3386 vshrn.u32 d4, q2, #16
3387 vld1.32 {d22}, [TMP2], STRIDE
3388 vshrn.u32 d5, q3, #16
3389 vadd.u16 q12, q12, q13
3390 vld1.32 {d23}, [TMP2]
3391 vmull.u8 q9, d22, d28
3392 mov TMP3, X, asr #16
3394 add TMP3, TOP, TMP3, asl #2
3395 mov TMP4, X, asr #16
3397 add TMP4, TOP, TMP4, asl #2
3398 vmlal.u8 q9, d23, d29
3399 vld1.32 {d22}, [TMP3], STRIDE
3400 vshr.u16 q15, q12, #8
3401 vld1.32 {d23}, [TMP3]
3402 vmull.u8 q10, d22, d28
3403 vmlal.u8 q10, d23, d29
3405 vshll.u16 q0, d16, #8
3407 vmlsl.u16 q0, d16, d30
3408 vmlal.u16 q0, d17, d30
3410 vld1.32 {d16}, [TMP4], STRIDE
3411 vadd.u16 q12, q12, q13
3412 vld1.32 {d17}, [TMP4]
3414 vmull.u8 q11, d16, d28
3415 vmlal.u8 q11, d17, d29
3417 vshll.u16 q1, d18, #8
3418 vmlsl.u16 q1, d18, d31
/* --- second four pixels, overlapped with r5g6b5 pack & store --- */
3420 mov TMP1, X, asr #16
3422 add TMP1, TOP, TMP1, asl #2
3423 mov TMP2, X, asr #16
3425 add TMP2, TOP, TMP2, asl #2
3426 vmlal.u16 q1, d19, d31
3428 vshr.u16 q15, q12, #8
3429 vshll.u16 q2, d20, #8
3431 vmlsl.u16 q2, d20, d30
3432 vmlal.u16 q2, d21, d30
3433 vshll.u16 q3, d22, #8
3434 vld1.32 {d20}, [TMP1], STRIDE
3435 vmlsl.u16 q3, d22, d31
3436 vmlal.u16 q3, d23, d31
3437 vld1.32 {d21}, [TMP1]
3438 vmull.u8 q8, d20, d28
3439 vmlal.u8 q8, d21, d29
/* begin packing previous group to r5g6b5 (field insert at bit 11) */
3441 vshll.u8 q5, d10, #8
3443 vshrn.u32 d0, q0, #16
3445 vshrn.u32 d1, q1, #16
3446 vsri.u16 q5, q7, #11
3447 vshrn.u32 d4, q2, #16
3448 vld1.32 {d22}, [TMP2], STRIDE
3449 vshrn.u32 d5, q3, #16
3450 vadd.u16 q12, q12, q13
3451 vld1.32 {d23}, [TMP2]
3452 vmull.u8 q9, d22, d28
3453 mov TMP3, X, asr #16
3455 add TMP3, TOP, TMP3, asl #2
3456 mov TMP4, X, asr #16
3458 add TMP4, TOP, TMP4, asl #2
3459 vmlal.u8 q9, d23, d29
3460 vld1.32 {d22}, [TMP3], STRIDE
3461 vshr.u16 q15, q12, #8
3462 vld1.32 {d23}, [TMP3]
3463 vmull.u8 q10, d22, d28
3464 vmlal.u8 q10, d23, d29
3466 vshll.u16 q0, d16, #8
3468 vmlsl.u16 q0, d16, d30
3469 vmlal.u16 q0, d17, d30
3471 vld1.32 {d16}, [TMP4], STRIDE
3472 vadd.u16 q12, q12, q13
3473 vld1.32 {d17}, [TMP4]
3475 vmull.u8 q11, d16, d28
3476 vmlal.u8 q11, d17, d29
3477 vshll.u16 q1, d18, #8
/* store previous group's eight r5g6b5 pixels mid-pipeline */
3478 vst1.32 {d10, d11}, [OUT, :128]!
3479 vmlsl.u16 q1, d18, d31
3481 /*****************************************************************************/
/*
 * Instantiate the bilinear SRC scanline scalers from the
 * generate_bilinear_scanline_func template.  Arguments: function name,
 * source format, destination format, two per-format numeric parameters
 * (apparently log2 of bytes per pixel: 2 for 8888, 1 for 0565 — TODO
 * confirm against the template definition in pixman-arm-neon-asm.h),
 * a prefetch distance of 28, and unrolling/register-usage flags.
 */
/* a8r8g8b8 -> a8r8g8b8, 4-pixel unroll */
3483 generate_bilinear_scanline_func \
3484 pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
3485 2, 2, 28, BILINEAR_FLAG_UNROLL_4
/* a8r8g8b8 -> r5g6b5, 8-pixel unroll via the custom fast path above */
3487 generate_bilinear_scanline_func \
3488 pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
3489 2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS
/* r5g6b5 -> x8r8g8b8, 4-pixel unroll */
3491 generate_bilinear_scanline_func \
3492 pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
3493 1, 2, 28, BILINEAR_FLAG_UNROLL_4
/* r5g6b5 -> r5g6b5, 4-pixel unroll */
3495 generate_bilinear_scanline_func \
3496 pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
3497 1, 1, 28, BILINEAR_FLAG_UNROLL_4