2 * Copyright © 2009 Nokia Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
23 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
27 * This file contains implementations of NEON optimized pixel processing
28 * functions. There is no full and detailed tutorial, but some functions
29 * (those which are exposing some new or interesting features) are
30 * extensively commented and can be used as examples.
32 * You may want to have a look at the comments for following functions:
33 * - pixman_composite_over_8888_0565_asm_neon
34 * - pixman_composite_over_n_8_0565_asm_neon
37 /* Prevent the stack from becoming executable for no reason... */
38 #if defined(__linux__) && defined(__ELF__)
39 .section .note.GNU-stack,"",%progbits
46 .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
47 .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
52 #include "pixman-arm-neon-asm.h"
54 /* Global configuration options and preferences */
57 * The code can optionally make use of unaligned memory accesses to improve
58 * performance of handling leading/trailing pixels for each scanline.
59 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
60 * example in linux if unaligned memory accesses are not configured to
61 * generate exceptions.
63 .set RESPECT_STRICT_ALIGNMENT, 1
66 * Set default prefetch type. There is a choice between the following options:
68 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
69 * as NOP to workaround some HW bugs or for whatever other reason)
71 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
72 * advanced prefetch introduces heavy overhead)
74 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
75 * which can run ARM and NEON instructions simultaneously so that extra ARM
76 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
78 * Note: some types of function can't support advanced prefetch and fallback
79 * to simple one (those which handle 24bpp pixels)
81 .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
83 /* Prefetch distance in pixels for simple prefetch */
84 .set PREFETCH_DISTANCE_SIMPLE, 64
87 * Implementation of pixman_composite_over_8888_0565_asm_neon
89 * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
90 * performs OVER compositing operation. Function fast_composite_over_8888_0565
91 * from pixman-fast-path.c does the same in C and can be used as a reference.
93 * First we need to have some NEON assembly code which can do the actual
94 * operation on the pixels and provide it to the template macro.
96 * Template macro quite conveniently takes care of emitting all the necessary
97 * code for memory reading and writing (including quite tricky cases of
98 * handling unaligned leading/trailing pixels), so we only need to deal with
99 * the data in NEON registers.
101 * NEON registers allocation in general is recommended to be the following:
102 * d0, d1, d2, d3 - contain loaded source pixel data
103 * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed)
104 * d24, d25, d26, d27 - contain loading mask pixel data (if mask is used)
105 * d28, d29, d30, d31 - place for storing the result (destination pixels)
107 * As can be seen above, four 64-bit NEON registers are used for keeping
108 * intermediate pixel data and up to 8 pixels can be processed in one step
109 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
111 * This particular function uses the following registers allocation:
112 * d0, d1, d2, d3 - contain loaded source pixel data
113 * d4, d5 - contain loaded destination pixels (they are needed)
114 * d28, d29 - place for storing the result (destination pixels)
118 * Step one. We need to have some code to do some arithmetics on pixel data.
119 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
120 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
121 * perform all the needed calculations and write the result to {d28, d29}.
122 * The rationale for having two macros and not just one will be explained
123 * later. In practice, any single monolithic function which does the work can
124 * be split into two parts in any arbitrary way without affecting correctness.
126 * There is one special trick here too. Common template macro can optionally
127 * make our life a bit easier by doing R, G, B, A color components
128 * deinterleaving for 32bpp pixel formats (and this feature is used in
129 * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
130 * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
131 * actually use d0 register for blue channel (a vector of eight 8-bit
132 * values), d1 register for green, d2 for red and d3 for alpha. This
133 * simple conversion can be also done with a few NEON instructions:
135 * Packed to planar conversion:
141 * Planar to packed conversion:
147 * But pixel can be loaded directly in planar format using VLD4.8 NEON
148 * instruction. It is 1 cycle slower than VLD1.32, so this is not always
149 * desirable, that's why deinterleaving is optional.
151 * But anyway, here is the code:
/* First half of the per-block arithmetic for OVER 8888 -> 0565.
 * Inputs: deinterleaved a8r8g8b8 source in d0-d3 (d0=B, d1=G, d2=R, d3=A),
 * packed r5g6b5 destination in {d4, d5}. Leaves intermediate products in
 * q10-q15 for the matching '*_tail' macro to finish.
 * NOTE(review): some instructions of this macro are not visible in this
 * chunk (original lines were elided); do not reorder what is here. */
153 .macro pixman_composite_over_8888_0565_process_pixblock_head
154 /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
155 and put data into d6 - red, d7 - green, d30 - blue */
160 vmvn.8 d3, d3 /* invert source alpha */
162 vshrn.u16 d30, q2, #2
163 /* now do alpha blending, storing results in 8-bit planar format
164 into d16 - red, d19 - green, d18 - blue */
167 vmull.u8 q12, d3, d30
168 vrshr.u16 q13, q10, #8
169 vrshr.u16 q3, q11, #8
170 vrshr.u16 q15, q12, #8
171 vraddhn.u16 d20, q10, q13
172 vraddhn.u16 d23, q11, q3
173 vraddhn.u16 d22, q12, q15
/* Second half: completes the blend started by the head macro and packs the
 * result back to r5g6b5 in {d28, d29} (vshll/vsri repack the channels). */
176 .macro pixman_composite_over_8888_0565_process_pixblock_tail
177 /* ... continue alpha blending */
178 vqadd.u8 d16, d2, d20
180 /* convert the result to r5g6b5 and store it into {d28, d29} */
181 vshll.u8 q14, d16, #8
185 vsri.u16 q14, q9, #11
189 * OK, now we got almost everything that we need. Using the above two
190 * macros, the work can be done right. But now we want to optimize
191 * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
192 * a lot from good code scheduling and software pipelining.
194 * Let's construct some code, which will run in the core main loop.
195 * Some pseudo-code of the main loop will look like this:
203 * It may look a bit weird, but this setup allows to hide instruction
204 * latencies better and also utilize dual-issue capability more
205 * efficiently (make pairs of load-store and ALU instructions).
207 * So what we need now is a '*_tail_head' macro, which will be used
208 * in the core main loop. A trivial straightforward implementation
209 * of this macro would look like this:
211 * pixman_composite_over_8888_0565_process_pixblock_tail
212 * vst1.16 {d28, d29}, [DST_W, :128]!
213 * vld1.16 {d4, d5}, [DST_R, :128]!
214 * vld4.32 {d0, d1, d2, d3}, [SRC]!
215 * pixman_composite_over_8888_0565_process_pixblock_head
218 * Now it also got some VLD/VST instructions. We simply can't move from
219 * processing one block of pixels to the other one with just arithmetics.
220 * The previously processed data needs to be written to memory and new
221 * data needs to be fetched. Fortunately, this main loop does not deal
222 * with partial leading/trailing pixels and can load/store a full block
223 * of pixels in a bulk. Additionally, destination buffer is already
224 * 16 bytes aligned here (which is good for performance).
226 * New things here are DST_R, DST_W, SRC and MASK identifiers. These
227 * are the aliases for ARM registers which are used as pointers for
228 * accessing data. We maintain separate pointers for reading and writing
229 * destination buffer (DST_R and DST_W).
231 * Another new thing is 'cache_preload' macro. It is used for prefetching
232 * data into CPU L2 cache and improve performance when dealing with large
233 * images which are far larger than cache size. It uses one argument
234 * (actually two, but they need to be the same here) - number of pixels
235 * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
236 * details about this macro. Moreover, if good performance is needed
237 * the code from this macro needs to be copied into '*_tail_head' macro
238 * and mixed with the rest of code for optimal instructions scheduling.
239 * We are actually doing it below.
241 * Now after all the explanations, here is the optimized code.
242 * Different instruction streams (originating from '*_head', '*_tail'
243 * and 'cache_preload' macro) use different indentation levels for
244 * better readability. Actually taking the code from one of these
245 * indentation levels and ignoring a few VLD/VST instructions would
246 * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
/* Software-pipelined main-loop body: interleaves the tail of the previous
 * pixel block, the head of the next one, the block load/store, and the
 * 'cache_preload' (PF) prefetch instructions for dual-issue on Cortex-A8.
 * Instruction order here is deliberate scheduling — do not reorder. */
252 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head
253 vqadd.u8 d16, d2, d20
254 vld1.16 {d4, d5}, [DST_R, :128]!
260 vshll.u8 q14, d16, #8
261 PF add PF_X, PF_X, #8
265 PF addne PF_X, PF_X, #8
267 PF subne PF_CTL, PF_CTL, #1
269 vshrn.u16 d30, q2, #2
271 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
273 vmull.u8 q12, d3, d30
274 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
278 vrshr.u16 q13, q10, #8
279 PF subge PF_X, PF_X, ORIG_W
280 vrshr.u16 q3, q11, #8
281 vrshr.u16 q15, q12, #8
282 PF subges PF_CTL, PF_CTL, #0x10
283 vsri.u16 q14, q9, #11
284 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
285 vraddhn.u16 d20, q10, q13
286 vraddhn.u16 d23, q11, q3
287 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
288 vraddhn.u16 d22, q12, q15
289 vst1.16 {d28, d29}, [DST_W, :128]!
/* Naive (unscheduled) alternative of the same macro, kept for illustration.
 * NOTE(review): this redefines the macro name above, so it is presumably
 * disabled by a preprocessor guard (e.g. '#if 0') on a line outside this
 * view — confirm before editing. */
294 /* If we did not care much about the performance, we would just use this... */
295 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head
296 pixman_composite_over_8888_0565_process_pixblock_tail
297 vst1.16 {d28, d29}, [DST_W, :128]!
298 vld1.16 {d4, d5}, [DST_R, :128]!
300 pixman_composite_over_8888_0565_process_pixblock_head
307 * And now the final part. We are using 'generate_composite_function' macro
308 * to put all the stuff together. We are specifying the name of the function
309 * which we want to get, number of bits per pixel for the source, mask and
310 * destination (0 if unused, like mask in this case). Next come some bit
312 * FLAG_DST_READWRITE - tells that the destination buffer is both read
313 * and written, for write-only buffer we would use
314 * FLAG_DST_WRITEONLY flag instead
315 * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
316 * and separate color channels for 32bpp format.
317 * The next things are:
318 * - the number of pixels processed per iteration (8 in this case, because
319 * that's the maximum what can fit into four 64-bit NEON registers).
320 * - prefetch distance, measured in pixel blocks. In this case it is 5 times
321 * by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
322 * prefetch distance can be selected by running some benchmarks.
324 * After that we specify some macros, these are 'default_init',
325 * 'default_cleanup' here which are empty (but it is possible to have custom
326 * init/cleanup macros to be able to save/restore some extra NEON registers
327 * like d8-d15 or do anything else) followed by
328 * 'pixman_composite_over_8888_0565_process_pixblock_head',
329 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
330 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
331 * which we got implemented above.
333 * The last part is the NEON registers allocation scheme.
/* Instantiate the OVER a8r8g8b8 -> r5g6b5 fast path: 32bpp src, no mask,
 * 16bpp dst; 8 pixels per block, prefetch distance 5 blocks (40 pixels). */
335 generate_composite_function \
336 pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
337 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
338 8, /* number of pixels, processed in a single block */ \
339 5, /* prefetch distance */ \
342 pixman_composite_over_8888_0565_process_pixblock_head, \
343 pixman_composite_over_8888_0565_process_pixblock_tail, \
344 pixman_composite_over_8888_0565_process_pixblock_tail_head, \
345 28, /* dst_w_basereg */ \
346 4, /* dst_r_basereg */ \
347 0, /* src_basereg */ \
348 24 /* mask_basereg */
350 /******************************************************************************/
/* OVER solid-color -> r5g6b5. Same arithmetic as over_8888_0565, but the
 * source is a constant loaded once by the init macro, so the per-block head
 * does not need to invert alpha (the init macro does it up front). */
352 .macro pixman_composite_over_n_0565_process_pixblock_head
353 /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
354 and put data into d6 - red, d7 - green, d30 - blue */
360 vshrn.u16 d30, q2, #2
361 /* now do alpha blending, storing results in 8-bit planar format
362 into d16 - red, d19 - green, d18 - blue */
365 vmull.u8 q12, d3, d30
366 vrshr.u16 q13, q10, #8
367 vrshr.u16 q3, q11, #8
368 vrshr.u16 q15, q12, #8
369 vraddhn.u16 d20, q10, q13
370 vraddhn.u16 d23, q11, q3
371 vraddhn.u16 d22, q12, q15
/* Finishes the blend and packs the result to r5g6b5 in {d28, d29}. */
374 .macro pixman_composite_over_n_0565_process_pixblock_tail
375 /* ... continue alpha blending */
376 vqadd.u8 d16, d2, d20
378 /* convert the result to r5g6b5 and store it into {d28, d29} */
379 vshll.u8 q14, d16, #8
383 vsri.u16 q14, q9, #11
/* Unscheduled main-loop body: plain tail + store + load + head. */
386 /* TODO: expand macros and do better instructions scheduling */
387 .macro pixman_composite_over_n_0565_process_pixblock_tail_head
388 pixman_composite_over_n_0565_process_pixblock_tail
389 vld1.16 {d4, d5}, [DST_R, :128]!
390 vst1.16 {d28, d29}, [DST_W, :128]!
391 pixman_composite_over_n_0565_process_pixblock_head
/* One-time setup: fetch the solid source color from the stack argument
 * area into d3 and pre-invert its alpha. */
395 .macro pixman_composite_over_n_0565_init
396 add DUMMY, sp, #ARGS_STACK_OFFSET
397 vld1.32 {d3[0]}, [DUMMY]
402 vmvn.8 d3, d3 /* invert source alpha */
/* Instantiate OVER solid -> r5g6b5 (src bpp 0 = solid color, no mask). */
405 generate_composite_function \
406 pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
407 FLAG_DST_READWRITE, \
408 8, /* number of pixels, processed in a single block */ \
409 5, /* prefetch distance */ \
410 pixman_composite_over_n_0565_init, \
412 pixman_composite_over_n_0565_process_pixblock_head, \
413 pixman_composite_over_n_0565_process_pixblock_tail, \
414 pixman_composite_over_n_0565_process_pixblock_tail_head, \
415 28, /* dst_w_basereg */ \
416 4, /* dst_r_basereg */ \
417 0, /* src_basereg */ \
418 24 /* mask_basereg */
420 /******************************************************************************/
/* SRC a8r8g8b8 -> r5g6b5: pure format conversion, destination is
 * write-only (no blending). Head body is mostly outside this view. */
422 .macro pixman_composite_src_8888_0565_process_pixblock_head
/* Tail: final vsri merges the shifted channels into packed r5g6b5. */
428 .macro pixman_composite_src_8888_0565_process_pixblock_tail
430 vsri.u16 q14, q9, #11
/* Scheduled main-loop body with source-side prefetch (PF) interleaved;
 * no destination prefetch is needed since dst is write-only. */
433 .macro pixman_composite_src_8888_0565_process_pixblock_tail_head
435 PF add PF_X, PF_X, #8
438 PF addne PF_X, PF_X, #8
439 PF subne PF_CTL, PF_CTL, #1
440 vsri.u16 q14, q9, #11
442 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
444 vst1.16 {d28, d29}, [DST_W, :128]!
445 PF subge PF_X, PF_X, ORIG_W
446 PF subges PF_CTL, PF_CTL, #0x10
448 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
/* Instantiate SRC 8888 -> 0565; longer prefetch distance (10 blocks)
 * suits this lighter, bandwidth-bound conversion loop. */
452 generate_composite_function \
453 pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
454 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
455 8, /* number of pixels, processed in a single block */ \
456 10, /* prefetch distance */ \
459 pixman_composite_src_8888_0565_process_pixblock_head, \
460 pixman_composite_src_8888_0565_process_pixblock_tail, \
461 pixman_composite_src_8888_0565_process_pixblock_tail_head
463 /******************************************************************************/
/* SRC r5g6b5 -> a8r8g8b8: unpack 16bpp pixels in q0 into planar 8-bit
 * channels d28-d30 (vshrn extracts R/G/B fields); alpha handling is on
 * lines outside this view. */
465 .macro pixman_composite_src_0565_8888_process_pixblock_head
466 vshrn.u16 d30, q0, #8
467 vshrn.u16 d29, q0, #3
472 vshrn.u16 d28, q0, #2
475 .macro pixman_composite_src_0565_8888_process_pixblock_tail
/* Unscheduled main-loop body: tail + interleaved store + head. */
478 /* TODO: expand macros and do better instructions scheduling */
479 .macro pixman_composite_src_0565_8888_process_pixblock_tail_head
480 pixman_composite_src_0565_8888_process_pixblock_tail
481 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
483 pixman_composite_src_0565_8888_process_pixblock_head
/* Instantiate SRC 0565 -> 8888 (16bpp src, write-only 32bpp dst). */
487 generate_composite_function \
488 pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
489 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
490 8, /* number of pixels, processed in a single block */ \
491 10, /* prefetch distance */ \
494 pixman_composite_src_0565_8888_process_pixblock_head, \
495 pixman_composite_src_0565_8888_process_pixblock_tail, \
496 pixman_composite_src_0565_8888_process_pixblock_tail_head
498 /******************************************************************************/
/* ADD a8 + a8: saturating add of source onto destination. The head/tail
 * bodies are outside this view (the operation is a single vqadd). */
500 .macro pixman_composite_add_8_8_process_pixblock_head
505 .macro pixman_composite_add_8_8_process_pixblock_tail
/* Scheduled main-loop body: 32 pixels (8bpp) per iteration, with both
 * source- and destination-side PF prefetch interleaved. */
508 .macro pixman_composite_add_8_8_process_pixblock_tail_head
510 PF add PF_X, PF_X, #32
512 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
513 PF addne PF_X, PF_X, #32
514 PF subne PF_CTL, PF_CTL, #1
515 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
517 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
518 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
519 PF subge PF_X, PF_X, ORIG_W
520 PF subges PF_CTL, PF_CTL, #0x10
522 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
523 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
/* Instantiate ADD a8 + a8 (8bpp src and dst, 32 pixels per block). */
527 generate_composite_function \
528 pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
529 FLAG_DST_READWRITE, \
530 32, /* number of pixels, processed in a single block */ \
531 10, /* prefetch distance */ \
534 pixman_composite_add_8_8_process_pixblock_head, \
535 pixman_composite_add_8_8_process_pixblock_tail, \
536 pixman_composite_add_8_8_process_pixblock_tail_head
538 /******************************************************************************/
/* ADD 8888 + 8888 main-loop body: reuses the add_8_8 head/tail (the
 * saturating add is format-agnostic) but loads/stores 8 x 32bpp pixels. */
540 .macro pixman_composite_add_8888_8888_process_pixblock_tail_head
542 PF add PF_X, PF_X, #8
544 vld1.32 {d4, d5, d6, d7}, [DST_R, :128]!
545 PF addne PF_X, PF_X, #8
546 PF subne PF_CTL, PF_CTL, #1
547 vst1.32 {d28, d29, d30, d31}, [DST_W, :128]!
549 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
550 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
551 PF subge PF_X, PF_X, ORIG_W
552 PF subges PF_CTL, PF_CTL, #0x10
554 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
555 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
/* Instantiate ADD 8888 + 8888, sharing the add_8_8 head/tail macros. */
559 generate_composite_function \
560 pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
561 FLAG_DST_READWRITE, \
562 8, /* number of pixels, processed in a single block */ \
563 10, /* prefetch distance */ \
566 pixman_composite_add_8_8_process_pixblock_head, \
567 pixman_composite_add_8_8_process_pixblock_tail, \
568 pixman_composite_add_8888_8888_process_pixblock_tail_head
/* Single-scanline variant (no prefetch distance argument) used by the
 * general compositing path. */
570 generate_composite_function_single_scanline \
571 pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
572 FLAG_DST_READWRITE, \
573 8, /* number of pixels, processed in a single block */ \
576 pixman_composite_add_8_8_process_pixblock_head, \
577 pixman_composite_add_8_8_process_pixblock_tail, \
578 pixman_composite_add_8888_8888_process_pixblock_tail_head
580 /******************************************************************************/
/* OUT_REVERSE 8888 -> 8888 head: dst * (1 - src.alpha). Multiplies the
 * destination channels (d4-d7) by inverted source alpha (d24); partial
 * products land in q8-q11 for the tail. Part of the body is outside
 * this view. */
582 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
583 vmvn.8 d24, d3 /* get inverted alpha */
584 /* do alpha blending */
587 vmull.u8 q10, d24, d6
588 vmull.u8 q11, d24, d7
/* Tail: rounding-narrow each 16-bit product back to 8 bits per channel
 * ((x + (x >> 8) + 128) >> 8 pattern via vrshr + vraddhn) into d28-d31. */
591 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
592 vrshr.u16 q14, q8, #8
593 vrshr.u16 q15, q9, #8
594 vrshr.u16 q12, q10, #8
595 vrshr.u16 q13, q11, #8
596 vraddhn.u16 d28, q14, q8
597 vraddhn.u16 d29, q15, q9
598 vraddhn.u16 d30, q12, q10
599 vraddhn.u16 d31, q13, q11
/* Scheduled main-loop body with interleaved PF prefetch. */
602 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
603 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
604 vrshr.u16 q14, q8, #8
605 PF add PF_X, PF_X, #8
607 vrshr.u16 q15, q9, #8
608 vrshr.u16 q12, q10, #8
609 vrshr.u16 q13, q11, #8
610 PF addne PF_X, PF_X, #8
611 PF subne PF_CTL, PF_CTL, #1
612 vraddhn.u16 d28, q14, q8
613 vraddhn.u16 d29, q15, q9
615 vraddhn.u16 d30, q12, q10
616 vraddhn.u16 d31, q13, q11
618 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
620 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
621 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
622 PF subge PF_X, PF_X, ORIG_W
624 PF subges PF_CTL, PF_CTL, #0x10
626 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
627 vmull.u8 q10, d22, d6
628 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
629 vmull.u8 q11, d22, d7
/* Single-scanline OUT_REVERSE entry point for the general path. */
632 generate_composite_function_single_scanline \
633 pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
634 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
635 8, /* number of pixels, processed in a single block */ \
638 pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
639 pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
640 pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
642 /******************************************************************************/
/* OVER 8888 -> 8888 is OUT_REVERSE (dst * (1 - src.a)) plus a saturating
 * add of the source: head is reused verbatim, tail appends the vqadd. */
644 .macro pixman_composite_over_8888_8888_process_pixblock_head
645 pixman_composite_out_reverse_8888_8888_process_pixblock_head
648 .macro pixman_composite_over_8888_8888_process_pixblock_tail
649 pixman_composite_out_reverse_8888_8888_process_pixblock_tail
650 vqadd.u8 q14, q0, q14
651 vqadd.u8 q15, q1, q15
/* Scheduled main-loop body: tail of previous block + head of next block
 * interleaved with loads/stores and PF prefetch. Do not reorder. */
654 .macro pixman_composite_over_8888_8888_process_pixblock_tail_head
655 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
656 vrshr.u16 q14, q8, #8
657 PF add PF_X, PF_X, #8
659 vrshr.u16 q15, q9, #8
660 vrshr.u16 q12, q10, #8
661 vrshr.u16 q13, q11, #8
662 PF addne PF_X, PF_X, #8
663 PF subne PF_CTL, PF_CTL, #1
664 vraddhn.u16 d28, q14, q8
665 vraddhn.u16 d29, q15, q9
667 vraddhn.u16 d30, q12, q10
668 vraddhn.u16 d31, q13, q11
669 vqadd.u8 q14, q0, q14
670 vqadd.u8 q15, q1, q15
672 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
674 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
675 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
676 PF subge PF_X, PF_X, ORIG_W
678 PF subges PF_CTL, PF_CTL, #0x10
680 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
681 vmull.u8 q10, d22, d6
682 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
683 vmull.u8 q11, d22, d7
/* Full-image OVER 8888 -> 8888 entry point. */
686 generate_composite_function \
687 pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
688 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
689 8, /* number of pixels, processed in a single block */ \
690 5, /* prefetch distance */ \
693 pixman_composite_over_8888_8888_process_pixblock_head, \
694 pixman_composite_over_8888_8888_process_pixblock_tail, \
695 pixman_composite_over_8888_8888_process_pixblock_tail_head
/* Single-scanline OVER variant for the general compositing path. */
697 generate_composite_function_single_scanline \
698 pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
699 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
700 8, /* number of pixels, processed in a single block */ \
703 pixman_composite_over_8888_8888_process_pixblock_head, \
704 pixman_composite_over_8888_8888_process_pixblock_tail, \
705 pixman_composite_over_8888_8888_process_pixblock_tail_head
707 /******************************************************************************/
/* OVER solid-color -> 8888: reuses the over_8888_8888 head/tail; only the
 * loop body differs (no source load — the solid color stays in d0-d3). */
709 /* TODO: expand macros and do better instructions scheduling */
710 .macro pixman_composite_over_n_8888_process_pixblock_tail_head
711 pixman_composite_over_8888_8888_process_pixblock_tail
712 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
713 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
714 pixman_composite_over_8888_8888_process_pixblock_head
/* One-time setup: load the solid source color from the stack argument
 * area into d3 (channel replication continues on lines outside this view). */
718 .macro pixman_composite_over_n_8888_init
719 add DUMMY, sp, #ARGS_STACK_OFFSET
720 vld1.32 {d3[0]}, [DUMMY]
/* Instantiate OVER solid -> 8888 (src bpp 0 = solid color). */
727 generate_composite_function \
728 pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
729 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
730 8, /* number of pixels, processed in a single block */ \
731 5, /* prefetch distance */ \
732 pixman_composite_over_n_8888_init, \
734 pixman_composite_over_8888_8888_process_pixblock_head, \
735 pixman_composite_over_8888_8888_process_pixblock_tail, \
736 pixman_composite_over_n_8888_process_pixblock_tail_head
738 /******************************************************************************/
/* OVER_REVERSE solid -> 8888: roles are swapped relative to over_n_8888 —
 * the destination is loaded into d0-d3 (src_basereg position) and the
 * solid color sits in the 'dst' registers, hence only dst-side prefetch.
 * Scheduled body; do not reorder. */
740 .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
741 vrshr.u16 q14, q8, #8
742 PF add PF_X, PF_X, #8
744 vrshr.u16 q15, q9, #8
745 vrshr.u16 q12, q10, #8
746 vrshr.u16 q13, q11, #8
747 PF addne PF_X, PF_X, #8
748 PF subne PF_CTL, PF_CTL, #1
749 vraddhn.u16 d28, q14, q8
750 vraddhn.u16 d29, q15, q9
752 vraddhn.u16 d30, q12, q10
753 vraddhn.u16 d31, q13, q11
754 vqadd.u8 q14, q0, q14
755 vqadd.u8 q15, q1, q15
756 vld4.8 {d0, d1, d2, d3}, [DST_R, :128]!
758 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
759 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
760 PF subge PF_X, PF_X, ORIG_W
762 PF subges PF_CTL, PF_CTL, #0x10
764 vmull.u8 q10, d22, d6
765 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
766 vmull.u8 q11, d22, d7
/* One-time setup: load the solid color into d7 (note: d7, not d3 — see
 * the swapped base registers passed to generate_composite_function below). */
769 .macro pixman_composite_over_reverse_n_8888_init
770 add DUMMY, sp, #ARGS_STACK_OFFSET
771 vld1.32 {d7[0]}, [DUMMY]
/* Instantiate OVER_REVERSE solid -> 8888; dst_r_basereg/src_basereg are
 * deliberately swapped (0/4) versus the usual (4/0) layout. */
778 generate_composite_function \
779 pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
780 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
781 8, /* number of pixels, processed in a single block */ \
782 5, /* prefetch distance */ \
783 pixman_composite_over_reverse_n_8888_init, \
785 pixman_composite_over_8888_8888_process_pixblock_head, \
786 pixman_composite_over_8888_8888_process_pixblock_tail, \
787 pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
788 28, /* dst_w_basereg */ \
789 0, /* dst_r_basereg */ \
790 4, /* src_basereg */ \
791 24 /* mask_basereg */
793 /******************************************************************************/
/* OVER 8888 with a8 mask -> 0565. Head: apply the mask (d24) to the source
 * channels (IN operation), unpack the r5g6b5 destination, then start the
 * alpha blend. Uses d8-d15, so the generated function must save/restore
 * them (default_init_need_all_regs below). Parts of the body are outside
 * this view. */
795 .macro pixman_composite_over_8888_8_0565_process_pixblock_head
796 vmull.u8 q0, d24, d8 /* IN for SRC pixels (part1) */
798 vmull.u8 q6, d24, d10
799 vmull.u8 q7, d24, d11
800 vshrn.u16 d6, q2, #8 /* convert DST_R data to 32-bpp (part1) */
803 vrshr.u16 q8, q0, #8 /* IN for SRC pixels (part2) */
805 vrshr.u16 q10, q6, #8
806 vrshr.u16 q11, q7, #8
807 vraddhn.u16 d0, q0, q8
808 vraddhn.u16 d1, q1, q9
809 vraddhn.u16 d2, q6, q10
810 vraddhn.u16 d3, q7, q11
811 vsri.u8 d6, d6, #5 /* convert DST_R data to 32-bpp (part2) */
814 vshrn.u16 d30, q2, #2
815 vmull.u8 q8, d3, d6 /* now do alpha blending */
817 vmull.u8 q10, d3, d30
/* Tail: finish the blend and repack to r5g6b5 in {d28, d29}. */
820 .macro pixman_composite_over_8888_8_0565_process_pixblock_tail
821 /* 3 cycle bubble (after vmull.u8) */
822 vrshr.u16 q13, q8, #8
823 vrshr.u16 q11, q9, #8
824 vrshr.u16 q15, q10, #8
825 vraddhn.u16 d16, q8, q13
826 vraddhn.u16 d27, q9, q11
827 vraddhn.u16 d26, q10, q15
828 vqadd.u8 d16, d2, d16
831 vshll.u8 q14, d16, #8 /* convert to 16bpp */
836 vsri.u16 q14, q9, #11
/* Scheduled main-loop body interleaving tail, head, load and store.
 * Do not reorder. */
839 .macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
840 vld1.16 {d4, d5}, [DST_R, :128]!
845 vmull.u8 q6, d24, d10
846 vrshr.u16 q13, q8, #8
847 vrshr.u16 q11, q9, #8
848 vrshr.u16 q15, q10, #8
849 vraddhn.u16 d16, q8, q13
850 vraddhn.u16 d27, q9, q11
851 vraddhn.u16 d26, q10, q15
852 vqadd.u8 d16, d2, d16
855 vshll.u8 q14, d16, #8
860 vmull.u8 q7, d24, d11
861 vsri.u16 q14, q9, #11
868 vrshr.u16 q10, q6, #8
869 vrshr.u16 q11, q7, #8
870 vraddhn.u16 d0, q0, q8
871 vraddhn.u16 d1, q1, q9
872 vraddhn.u16 d2, q6, q10
873 vraddhn.u16 d3, q7, q11
877 vshrn.u16 d30, q2, #2
878 vst1.16 {d28, d29}, [DST_W, :128]!
881 vmull.u8 q10, d3, d30
/* Instantiate OVER 8888 + a8 mask -> 0565; src lives in d8+ because
 * d0-d7 are consumed by the IN/blend pipeline. */
884 generate_composite_function \
885 pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
886 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
887 8, /* number of pixels, processed in a single block */ \
888 5, /* prefetch distance */ \
889 default_init_need_all_regs, \
890 default_cleanup_need_all_regs, \
891 pixman_composite_over_8888_8_0565_process_pixblock_head, \
892 pixman_composite_over_8888_8_0565_process_pixblock_tail, \
893 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
894 28, /* dst_w_basereg */ \
895 4, /* dst_r_basereg */ \
896 8, /* src_basereg */ \
897 24 /* mask_basereg */
899 /******************************************************************************/
902 * This function needs a special initialization of solid mask.
903 * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
904 * offset, split into color components and replicated in d8-d11
905 * registers. Additionally, this function needs all the NEON registers,
906 * so it has to save d8-d15 registers which are callee saved according
907 * to ABI. These registers are restored from 'cleanup' macro. All the
908 * other NEON registers are caller saved, so can be clobbered freely
909 * without introducing any problems.
/* One-time setup for OVER solid + a8 mask -> 0565: fetch the solid color
 * from the stack into d11 and replicate/split it into d8-d11 (the
 * replication instructions and the d8-d15 save are on lines outside this
 * view — see the explanatory comment above). */
911 .macro pixman_composite_over_n_8_0565_init
912 add DUMMY, sp, #ARGS_STACK_OFFSET
914 vld1.32 {d11[0]}, [DUMMY]
/* Cleanup: restores the callee-saved d8-d15 registers (body outside view). */
921 .macro pixman_composite_over_n_8_0565_cleanup
/* Instantiate OVER solid + a8 mask -> 0565, sharing the 8888_8_0565
 * pixblock macros (the solid color pre-loaded in d8-d11 plays the role
 * of the per-pixel source). */
925 generate_composite_function \
926 pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
927 FLAG_DST_READWRITE, \
928 8, /* number of pixels, processed in a single block */ \
929 5, /* prefetch distance */ \
930 pixman_composite_over_n_8_0565_init, \
931 pixman_composite_over_n_8_0565_cleanup, \
932 pixman_composite_over_8888_8_0565_process_pixblock_head, \
933 pixman_composite_over_8888_8_0565_process_pixblock_tail, \
934 pixman_composite_over_8888_8_0565_process_pixblock_tail_head
936 /******************************************************************************/
/* One-time setup for OVER 8888 + solid mask -> 0565: the solid mask value
 * is the second stack argument (+8), loaded into d24 where the pixblock
 * macros expect mask data (replication is on lines outside this view). */
938 .macro pixman_composite_over_8888_n_0565_init
939 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
941 vld1.32 {d24[0]}, [DUMMY]
/* Cleanup counterpart (body outside this view). */
945 .macro pixman_composite_over_8888_n_0565_cleanup
/* Instantiate OVER 8888 + solid mask -> 0565, reusing the 8888_8_0565
 * pixblock macros with mask bpp 0 (constant mask in d24). */
949 generate_composite_function \
950 pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
951 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
952 8, /* number of pixels, processed in a single block */ \
953 5, /* prefetch distance */ \
954 pixman_composite_over_8888_n_0565_init, \
955 pixman_composite_over_8888_n_0565_cleanup, \
956 pixman_composite_over_8888_8_0565_process_pixblock_head, \
957 pixman_composite_over_8888_8_0565_process_pixblock_tail, \
958 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
959 28, /* dst_w_basereg */ \
960 4, /* dst_r_basereg */ \
961 8, /* src_basereg */ \
962 24 /* mask_basereg */
964 /******************************************************************************/
/* SRC 0565 -> 0565: a plain 16bpp copy. Head/tail do no arithmetic; the
 * loop body just stores the block loaded by the template (load side is
 * outside this view). */
966 .macro pixman_composite_src_0565_0565_process_pixblock_head
969 .macro pixman_composite_src_0565_0565_process_pixblock_tail
972 .macro pixman_composite_src_0565_0565_process_pixblock_tail_head
973 vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
/* Instantiate the 16bpp copy; data passes through d0-d3, hence all base
 * registers are 0. */
978 generate_composite_function \
979 pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
980 FLAG_DST_WRITEONLY, \
981 16, /* number of pixels, processed in a single block */ \
982 10, /* prefetch distance */ \
985 pixman_composite_src_0565_0565_process_pixblock_head, \
986 pixman_composite_src_0565_0565_process_pixblock_tail, \
987 pixman_composite_src_0565_0565_process_pixblock_tail_head, \
988 0, /* dst_w_basereg */ \
989 0, /* dst_r_basereg */ \
990 0, /* src_basereg */ \
993 /******************************************************************************/
/* SRC solid -> a8: fill with a constant byte. No per-block arithmetic;
 * the loop body only stores the pre-replicated value from d0-d3. */
995 .macro pixman_composite_src_n_8_process_pixblock_head
998 .macro pixman_composite_src_n_8_process_pixblock_tail
1001 .macro pixman_composite_src_n_8_process_pixblock_tail_head
1002 vst1.8 {d0, d1, d2, d3}, [DST_W, :128]!
/* One-time setup: load the solid value from the stack and replicate it
 * across d0 with vsli doubling (16 then 32 bits); replication into
 * d1-d3 continues on lines outside this view. */
1005 .macro pixman_composite_src_n_8_init
1006 add DUMMY, sp, #ARGS_STACK_OFFSET
1007 vld1.32 {d0[0]}, [DUMMY]
1009 vsli.u64 d0, d0, #16
1010 vsli.u64 d0, d0, #32
1015 .macro pixman_composite_src_n_8_cleanup
/* Instantiate the a8 solid fill: 32 pixels per block, prefetch disabled
 * (0) since a fill has no source stream to prefetch. */
1018 generate_composite_function \
1019 pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
1020 FLAG_DST_WRITEONLY, \
1021 32, /* number of pixels, processed in a single block */ \
1022 0, /* prefetch distance */ \
1023 pixman_composite_src_n_8_init, \
1024 pixman_composite_src_n_8_cleanup, \
1025 pixman_composite_src_n_8_process_pixblock_head, \
1026 pixman_composite_src_n_8_process_pixblock_tail, \
1027 pixman_composite_src_n_8_process_pixblock_tail_head, \
1028 0, /* dst_w_basereg */ \
1029 0, /* dst_r_basereg */ \
1030 0, /* src_basereg */ \
1031 0 /* mask_basereg */
1033 /******************************************************************************/
/*
 * src_n_0565: fill an r5g6b5 destination with a solid 16-bit value, 16 pixels
 * per block.  init loads the solid value and replicates it within d0 via vsli
 * (#16 then #32); the broadcast to d1..d3 and the `.endm` closers are not
 * visible in this abridged listing.
 */
1035 .macro pixman_composite_src_n_0565_process_pixblock_head
1038 .macro pixman_composite_src_n_0565_process_pixblock_tail
1041 .macro pixman_composite_src_n_0565_process_pixblock_tail_head
1042 vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
1045 .macro pixman_composite_src_n_0565_init
1046 add DUMMY, sp, #ARGS_STACK_OFFSET
1047 vld1.32 {d0[0]}, [DUMMY]
1048 vsli.u64 d0, d0, #16
1049 vsli.u64 d0, d0, #32
1054 .macro pixman_composite_src_n_0565_cleanup
1057 generate_composite_function \
1058 pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
1059 FLAG_DST_WRITEONLY, \
1060 16, /* number of pixels, processed in a single block */ \
1061 0, /* prefetch distance */ \
1062 pixman_composite_src_n_0565_init, \
1063 pixman_composite_src_n_0565_cleanup, \
1064 pixman_composite_src_n_0565_process_pixblock_head, \
1065 pixman_composite_src_n_0565_process_pixblock_tail, \
1066 pixman_composite_src_n_0565_process_pixblock_tail_head, \
1067 0, /* dst_w_basereg */ \
1068 0, /* dst_r_basereg */ \
1069 0, /* src_basereg */ \
1070 0 /* mask_basereg */
1072 /******************************************************************************/
/*
 * src_n_8888: fill an a8r8g8b8 destination with a solid 32-bit value, 8 pixels
 * per block.  Only one vsli (#32) is needed since the unit is already 32 bits.
 */
1074 .macro pixman_composite_src_n_8888_process_pixblock_head
1077 .macro pixman_composite_src_n_8888_process_pixblock_tail
1080 .macro pixman_composite_src_n_8888_process_pixblock_tail_head
1081 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1084 .macro pixman_composite_src_n_8888_init
1085 add DUMMY, sp, #ARGS_STACK_OFFSET
1086 vld1.32 {d0[0]}, [DUMMY]
1087 vsli.u64 d0, d0, #32
1092 .macro pixman_composite_src_n_8888_cleanup
1095 generate_composite_function \
1096 pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
1097 FLAG_DST_WRITEONLY, \
1098 8, /* number of pixels, processed in a single block */ \
1099 0, /* prefetch distance */ \
1100 pixman_composite_src_n_8888_init, \
1101 pixman_composite_src_n_8888_cleanup, \
1102 pixman_composite_src_n_8888_process_pixblock_head, \
1103 pixman_composite_src_n_8888_process_pixblock_tail, \
1104 pixman_composite_src_n_8888_process_pixblock_tail_head, \
1105 0, /* dst_w_basereg */ \
1106 0, /* dst_r_basereg */ \
1107 0, /* src_basereg */ \
1108 0 /* mask_basereg */
1110 /******************************************************************************/
/*
 * src_8888_8888: straight copy of a8r8g8b8 pixels, 8 per block.  As with the
 * other src_* copies, the fetch/preload lines of tail_head and the init/
 * cleanup arguments of the generate call are not visible in this abridged
 * listing.
 */
1112 .macro pixman_composite_src_8888_8888_process_pixblock_head
1115 .macro pixman_composite_src_8888_8888_process_pixblock_tail
1118 .macro pixman_composite_src_8888_8888_process_pixblock_tail_head
1119 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1124 generate_composite_function \
1125 pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
1126 FLAG_DST_WRITEONLY, \
1127 8, /* number of pixels, processed in a single block */ \
1128 10, /* prefetch distance */ \
1131 pixman_composite_src_8888_8888_process_pixblock_head, \
1132 pixman_composite_src_8888_8888_process_pixblock_tail, \
1133 pixman_composite_src_8888_8888_process_pixblock_tail_head, \
1134 0, /* dst_w_basereg */ \
1135 0, /* dst_r_basereg */ \
1136 0, /* src_basereg */ \
1137 0 /* mask_basereg */
1139 /******************************************************************************/
/*
 * src_x888_8888: copy x8r8g8b8 -> a8r8g8b8, forcing the alpha byte to 0xff.
 * init builds an alpha mask in q2 (vshl of an all-ones value by #24 leaves
 * 0xff000000 per lane -- the instruction that seeds q2 is not visible here);
 * head presumably ORs q2 into the pixels -- TODO confirm against upstream.
 */
1141 .macro pixman_composite_src_x888_8888_process_pixblock_head
1146 .macro pixman_composite_src_x888_8888_process_pixblock_tail
1149 .macro pixman_composite_src_x888_8888_process_pixblock_tail_head
1150 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1157 .macro pixman_composite_src_x888_8888_init
1159 vshl.u32 q2, q2, #24
1162 generate_composite_function \
1163 pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
1164 FLAG_DST_WRITEONLY, \
1165 8, /* number of pixels, processed in a single block */ \
1166 10, /* prefetch distance */ \
1167 pixman_composite_src_x888_8888_init, \
1169 pixman_composite_src_x888_8888_process_pixblock_head, \
1170 pixman_composite_src_x888_8888_process_pixblock_tail, \
1171 pixman_composite_src_x888_8888_process_pixblock_tail_head, \
1172 0, /* dst_w_basereg */ \
1173 0, /* dst_r_basereg */ \
1174 0, /* src_basereg */ \
1175 0 /* mask_basereg */
1177 /******************************************************************************/
/*
 * over_n_8_8888: OVER of a solid color through an a8 mask onto a8r8g8b8,
 * 8 pixels per block.  The recurring vmull/vrshr/vraddhn triple implements
 * the rounded byte product (x*y)/255: widen-multiply, add the value shifted
 * right by 8 with rounding, then take the rounded high halves.
 */
1179 .macro pixman_composite_over_n_8_8888_process_pixblock_head
1180 /* expecting deinterleaved source data in {d8, d9, d10, d11} */
1181 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
1182 /* and destination data in {d4, d5, d6, d7} */
1183 /* mask is in d24 (d25, d26, d27 are unused) */
/* in: multiply each solid-source channel by the 8-bit mask */
1186 vmull.u8 q0, d24, d8
1187 vmull.u8 q1, d24, d9
1188 vmull.u8 q6, d24, d10
1189 vmull.u8 q7, d24, d11
1190 vrshr.u16 q10, q0, #8
1191 vrshr.u16 q11, q1, #8
1192 vrshr.u16 q12, q6, #8
1193 vrshr.u16 q13, q7, #8
1194 vraddhn.u16 d0, q0, q10
1195 vraddhn.u16 d1, q1, q11
1196 vraddhn.u16 d2, q6, q12
1197 vraddhn.u16 d3, q7, q13
1198 vmvn.8 d24, d3 /* get inverted alpha */
1199 /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */
1200 /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
1201 /* now do alpha blending */
1202 vmull.u8 q8, d24, d4
1203 vmull.u8 q9, d24, d5
1204 vmull.u8 q10, d24, d6
1205 vmull.u8 q11, d24, d7
/* tail: finish dst*(1-alpha) reduction, then saturating add of the source */
1208 .macro pixman_composite_over_n_8_8888_process_pixblock_tail
1209 vrshr.u16 q14, q8, #8
1210 vrshr.u16 q15, q9, #8
1211 vrshr.u16 q12, q10, #8
1212 vrshr.u16 q13, q11, #8
1213 vraddhn.u16 d28, q14, q8
1214 vraddhn.u16 d29, q15, q9
1215 vraddhn.u16 d30, q12, q10
1216 vraddhn.u16 d31, q13, q11
1217 vqadd.u8 q14, q0, q14
1218 vqadd.u8 q15, q1, q15
1221 /* TODO: expand macros and do better instructions scheduling */
1222 .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
1223 pixman_composite_over_n_8_8888_process_pixblock_tail
1224 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1225 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1228 pixman_composite_over_n_8_8888_process_pixblock_head
/* init loads the solid color into d11[0]; the duplication of the color into
 * d8..d11 channels (lines 1235+) is not visible in this abridged listing */
1231 .macro pixman_composite_over_n_8_8888_init
1232 add DUMMY, sp, #ARGS_STACK_OFFSET
1234 vld1.32 {d11[0]}, [DUMMY]
1241 .macro pixman_composite_over_n_8_8888_cleanup
1245 generate_composite_function \
1246 pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
1247 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1248 8, /* number of pixels, processed in a single block */ \
1249 5, /* prefetch distance */ \
1250 pixman_composite_over_n_8_8888_init, \
1251 pixman_composite_over_n_8_8888_cleanup, \
1252 pixman_composite_over_n_8_8888_process_pixblock_head, \
1253 pixman_composite_over_n_8_8888_process_pixblock_tail, \
1254 pixman_composite_over_n_8_8888_process_pixblock_tail_head
1256 /******************************************************************************/
/*
 * over_n_8_8: OVER of a solid 8-bit value through an a8 mask onto an a8
 * destination, 32 pixels per block.  d8 holds the replicated solid value
 * (loaded by init below); d24..d27 hold 32 mask bytes, d4..d7 the dest.
 */
1258 .macro pixman_composite_over_n_8_8_process_pixblock_head
/* src*mask with rounded /255 reduction (vmull + vrshr + vraddhn idiom) */
1259 vmull.u8 q0, d24, d8
1260 vmull.u8 q1, d25, d8
1261 vmull.u8 q6, d26, d8
1262 vmull.u8 q7, d27, d8
1263 vrshr.u16 q10, q0, #8
1264 vrshr.u16 q11, q1, #8
1265 vrshr.u16 q12, q6, #8
1266 vrshr.u16 q13, q7, #8
1267 vraddhn.u16 d0, q0, q10
1268 vraddhn.u16 d1, q1, q11
1269 vraddhn.u16 d2, q6, q12
1270 vraddhn.u16 d3, q7, q13
/* dst*mask partial products; the inversion of the mask expected here is not
 * visible in this abridged listing -- confirm against upstream */
1273 vmull.u8 q8, d24, d4
1274 vmull.u8 q9, d25, d5
1275 vmull.u8 q10, d26, d6
1276 vmull.u8 q11, d27, d7
1279 .macro pixman_composite_over_n_8_8_process_pixblock_tail
1280 vrshr.u16 q14, q8, #8
1281 vrshr.u16 q15, q9, #8
1282 vrshr.u16 q12, q10, #8
1283 vrshr.u16 q13, q11, #8
1284 vraddhn.u16 d28, q14, q8
1285 vraddhn.u16 d29, q15, q9
1286 vraddhn.u16 d30, q12, q10
1287 vraddhn.u16 d31, q13, q11
/* saturating add of masked source on top of attenuated destination */
1288 vqadd.u8 q14, q0, q14
1289 vqadd.u8 q15, q1, q15
1292 /* TODO: expand macros and do better instructions scheduling */
1293 .macro pixman_composite_over_n_8_8_process_pixblock_tail_head
1294 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1295 pixman_composite_over_n_8_8_process_pixblock_tail
1297 cache_preload 32, 32
1298 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1299 pixman_composite_over_n_8_8_process_pixblock_head
1302 .macro pixman_composite_over_n_8_8_init
1303 add DUMMY, sp, #ARGS_STACK_OFFSET
/* load solid value; the vdup that replicates it across d8 (line 1306) is not
 * visible in this abridged listing */
1305 vld1.32 {d8[0]}, [DUMMY]
1309 .macro pixman_composite_over_n_8_8_cleanup
1313 generate_composite_function \
1314 pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
1315 FLAG_DST_READWRITE, \
1316 32, /* number of pixels, processed in a single block */ \
1317 5, /* prefetch distance */ \
1318 pixman_composite_over_n_8_8_init, \
1319 pixman_composite_over_n_8_8_cleanup, \
1320 pixman_composite_over_n_8_8_process_pixblock_head, \
1321 pixman_composite_over_n_8_8_process_pixblock_tail, \
1322 pixman_composite_over_n_8_8_process_pixblock_tail_head
1324 /******************************************************************************/
/*
 * over_n_8888_8888_ca: component-alpha OVER of a solid color through an
 * a8r8g8b8 mask onto a8r8g8b8, 8 pixels per block.  The bare `*` lines below
 * are interiors of block comments whose delimiters were lost in this abridged
 * listing.  Note the asymmetry in the mask update: the alpha channel of the
 * updated mask lands in d3 (src alpha), not d27 -- as the original comment
 * states ("updated mask in {d24, d25, d26, d3}").
 */
1326 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1328 * 'combine_mask_ca' replacement
1330 * input: solid src (n) in {d8, d9, d10, d11}
1331 * dest in {d4, d5, d6, d7 }
1332 * mask in {d24, d25, d26, d27}
1333 * output: updated src in {d0, d1, d2, d3 }
1334 * updated mask in {d24, d25, d26, d3 }
/* src = src * mask (per channel); mask = mask * src_alpha (per channel) */
1336 vmull.u8 q0, d24, d8
1337 vmull.u8 q1, d25, d9
1338 vmull.u8 q6, d26, d10
1339 vmull.u8 q7, d27, d11
1340 vmull.u8 q9, d11, d25
1341 vmull.u8 q12, d11, d24
1342 vmull.u8 q13, d11, d26
1343 vrshr.u16 q8, q0, #8
1344 vrshr.u16 q10, q1, #8
1345 vrshr.u16 q11, q6, #8
1346 vraddhn.u16 d0, q0, q8
1347 vraddhn.u16 d1, q1, q10
1348 vraddhn.u16 d2, q6, q11
1349 vrshr.u16 q11, q12, #8
1350 vrshr.u16 q8, q9, #8
1351 vrshr.u16 q6, q13, #8
1352 vrshr.u16 q10, q7, #8
1353 vraddhn.u16 d24, q12, q11
1354 vraddhn.u16 d25, q9, q8
1355 vraddhn.u16 d26, q13, q6
1356 vraddhn.u16 d3, q7, q10
1358 * 'combine_over_ca' replacement
1360 * output: updated dest in {d28, d29, d30, d31}
/* dst * (updated mask); completion (shift/round/add) happens in the tail.
 * The mask inversion expected between these steps is not visible in this
 * abridged listing -- confirm against upstream. */
1364 vmull.u8 q8, d24, d4
1365 vmull.u8 q9, d25, d5
1367 vmull.u8 q10, d26, d6
1368 vmull.u8 q11, d27, d7
1371 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
1372 /* ... continue 'combine_over_ca' replacement */
1373 vrshr.u16 q14, q8, #8
1374 vrshr.u16 q15, q9, #8
1375 vrshr.u16 q6, q10, #8
1376 vrshr.u16 q7, q11, #8
1377 vraddhn.u16 d28, q14, q8
1378 vraddhn.u16 d29, q15, q9
1379 vraddhn.u16 d30, q6, q10
1380 vraddhn.u16 d31, q7, q11
1381 vqadd.u8 q14, q0, q14
1382 vqadd.u8 q15, q1, q15
/* tail_head: software-pipelined merge of tail (current block) and head (next
 * block), with the dest load/store interleaved for dual-issue on Cortex-A8 */
1385 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1386 vrshr.u16 q14, q8, #8
1387 vrshr.u16 q15, q9, #8
1388 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1389 vrshr.u16 q6, q10, #8
1390 vrshr.u16 q7, q11, #8
1391 vraddhn.u16 d28, q14, q8
1392 vraddhn.u16 d29, q15, q9
1393 vraddhn.u16 d30, q6, q10
1394 vraddhn.u16 d31, q7, q11
1396 vqadd.u8 q14, q0, q14
1397 vqadd.u8 q15, q1, q15
1399 pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1400 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1403 .macro pixman_composite_over_n_8888_8888_ca_init
1404 add DUMMY, sp, #ARGS_STACK_OFFSET
/* solid color into d11[0]; per-channel duplication into d8..d11 (following
 * lines) is not visible in this abridged listing */
1406 vld1.32 {d11[0]}, [DUMMY]
1413 .macro pixman_composite_over_n_8888_8888_ca_cleanup
1417 generate_composite_function \
1418 pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
1419 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1420 8, /* number of pixels, processed in a single block */ \
1421 5, /* prefetch distance */ \
1422 pixman_composite_over_n_8888_8888_ca_init, \
1423 pixman_composite_over_n_8888_8888_ca_cleanup, \
1424 pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
1425 pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
1426 pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1428 /******************************************************************************/
/*
 * over_n_8888_0565_ca: component-alpha OVER of a solid color through an
 * a8r8g8b8 mask onto an r5g6b5 destination, 8 pixels per block.  Destination
 * pixels are expanded from r5g6b5 in q2 (d4/d5) to planar 8-bit channels in
 * d16/d17/d18, blended, then repacked to r5g6b5 in q14 (d28/d29).  In the
 * pipelined tail_head, d22 stands in for d18 of the straight-line tail
 * because q9/d18 is already busy with next-block work.
 */
1430 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
1432 * 'combine_mask_ca' replacement
1434 * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A]
1435 * mask in {d24, d25, d26} [B, G, R]
1436 * output: updated src in {d0, d1, d2 } [B, G, R]
1437 * updated mask in {d24, d25, d26} [B, G, R]
1439 vmull.u8 q0, d24, d8
1440 vmull.u8 q1, d25, d9
1441 vmull.u8 q6, d26, d10
1442 vmull.u8 q9, d11, d25
1443 vmull.u8 q12, d11, d24
1444 vmull.u8 q13, d11, d26
1445 vrshr.u16 q8, q0, #8
1446 vrshr.u16 q10, q1, #8
1447 vrshr.u16 q11, q6, #8
1448 vraddhn.u16 d0, q0, q8
1449 vraddhn.u16 d1, q1, q10
1450 vraddhn.u16 d2, q6, q11
1451 vrshr.u16 q11, q12, #8
1452 vrshr.u16 q8, q9, #8
1453 vrshr.u16 q6, q13, #8
1454 vraddhn.u16 d24, q12, q11
1455 vraddhn.u16 d25, q9, q8
1457 * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
1458 * and put data into d16 - blue, d17 - green, d18 - red
1460 vshrn.u16 d17, q2, #3
1461 vshrn.u16 d18, q2, #8
1462 vraddhn.u16 d26, q13, q6
/* replicate top channel bits into the low bits for full 8-bit range */
1464 vsri.u8 d18, d18, #5
1465 vsri.u8 d17, d17, #6
1467 * 'combine_over_ca' replacement
1469 * output: updated dest in d16 - blue, d17 - green, d18 - red
1472 vshrn.u16 d16, q2, #2
/* dst * mask partial products; the mask inversion expected before this step
 * is not visible in this abridged listing -- confirm against upstream */
1474 vmull.u8 q6, d16, d24
1475 vmull.u8 q7, d17, d25
1476 vmull.u8 q11, d18, d26
1479 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
1480 /* ... continue 'combine_over_ca' replacement */
1481 vrshr.u16 q10, q6, #8
1482 vrshr.u16 q14, q7, #8
1483 vrshr.u16 q15, q11, #8
1484 vraddhn.u16 d16, q10, q6
1485 vraddhn.u16 d17, q14, q7
1486 vraddhn.u16 d18, q15, q11
/* saturating add of the masked source (d2 = red); the companion adds for
 * d16/d17 are not visible in this abridged listing */
1488 vqadd.u8 d18, d2, d18
1490 * convert the results in d16, d17, d18 to r5g6b5 and store
1491 * them into {d28, d29}
1493 vshll.u8 q14, d18, #8
1494 vshll.u8 q10, d17, #8
1495 vshll.u8 q15, d16, #8
1496 vsri.u16 q14, q10, #5
1497 vsri.u16 q14, q15, #11
/* tail_head: heavily interleaved tail-of-current / head-of-next schedule;
 * statement order is load-bearing -- do not reorder */
1500 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
1502 vrshr.u16 q10, q6, #8
1503 vrshr.u16 q14, q7, #8
1504 vld1.16 {d4, d5}, [DST_R, :128]!
1505 vrshr.u16 q15, q11, #8
1506 vraddhn.u16 d16, q10, q6
1507 vraddhn.u16 d17, q14, q7
/* d22 replaces d18 here (see block comment above) */
1508 vraddhn.u16 d22, q15, q11
1509 /* process_pixblock_head */
1511 * 'combine_mask_ca' replacement
1513 * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A]
1514 * mask in {d24, d25, d26} [B, G, R]
1515 * output: updated src in {d0, d1, d2 } [B, G, R]
1516 * updated mask in {d24, d25, d26} [B, G, R]
1518 vmull.u8 q6, d26, d10
1520 vmull.u8 q0, d24, d8
1521 vqadd.u8 d22, d2, d22
1522 vmull.u8 q1, d25, d9
1524 * convert the result in d16, d17, d22 to r5g6b5 and store
1525 * it into {d28, d29}
1527 vshll.u8 q14, d22, #8
1528 vshll.u8 q10, d17, #8
1529 vshll.u8 q15, d16, #8
1530 vmull.u8 q9, d11, d25
1531 vsri.u16 q14, q10, #5
1532 vmull.u8 q12, d11, d24
1533 vmull.u8 q13, d11, d26
1534 vsri.u16 q14, q15, #11
1536 vrshr.u16 q8, q0, #8
1537 vrshr.u16 q10, q1, #8
1538 vrshr.u16 q11, q6, #8
1539 vraddhn.u16 d0, q0, q8
1540 vraddhn.u16 d1, q1, q10
1541 vraddhn.u16 d2, q6, q11
1542 vrshr.u16 q11, q12, #8
1543 vrshr.u16 q8, q9, #8
1544 vrshr.u16 q6, q13, #8
1545 vraddhn.u16 d24, q12, q11
1546 vraddhn.u16 d25, q9, q8
1548 * convert 8 r5g6b5 pixel data from {d4, d5} to planar
1549 * 8-bit format and put data into d16 - blue, d17 - green,
1552 vshrn.u16 d17, q2, #3
1553 vshrn.u16 d18, q2, #8
1554 vraddhn.u16 d26, q13, q6
1556 vsri.u8 d17, d17, #6
1557 vsri.u8 d18, d18, #5
1559 * 'combine_over_ca' replacement
1561 * output: updated dest in d16 - blue, d17 - green, d18 - red
1564 vshrn.u16 d16, q2, #2
1566 vmull.u8 q7, d17, d25
1567 vmull.u8 q6, d16, d24
1568 vmull.u8 q11, d18, d26
1569 vst1.16 {d28, d29}, [DST_W, :128]!
1572 .macro pixman_composite_over_n_8888_0565_ca_init
1573 add DUMMY, sp, #ARGS_STACK_OFFSET
/* solid color into d11[0]; channel duplication lines are not visible here */
1575 vld1.32 {d11[0]}, [DUMMY]
1582 .macro pixman_composite_over_n_8888_0565_ca_cleanup
1586 generate_composite_function \
1587 pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
1588 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1589 8, /* number of pixels, processed in a single block */ \
1590 5, /* prefetch distance */ \
1591 pixman_composite_over_n_8888_0565_ca_init, \
1592 pixman_composite_over_n_8888_0565_ca_cleanup, \
1593 pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
1594 pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
1595 pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
1597 /******************************************************************************/
/*
 * in_n_8: IN of a solid 8-bit value with an a8 destination (dst = dst*src),
 * 32 pixels per block.  Only the q10/q11 products for d6/d7 are visible in
 * this abridged listing; the matching q8/q9 products for d4/d5 (lines
 * 1602-1603) are consumed by the tail below -- confirm against upstream.
 */
1599 .macro pixman_composite_in_n_8_process_pixblock_head
1600 /* expecting source data in {d0, d1, d2, d3} */
1601 /* and destination data in {d4, d5, d6, d7} */
1604 vmull.u8 q10, d6, d3
1605 vmull.u8 q11, d7, d3
1608 .macro pixman_composite_in_n_8_process_pixblock_tail
/* rounded /255 reduction of the four widened products into d28..d31 */
1609 vrshr.u16 q14, q8, #8
1610 vrshr.u16 q15, q9, #8
1611 vrshr.u16 q12, q10, #8
1612 vrshr.u16 q13, q11, #8
1613 vraddhn.u16 d28, q8, q14
1614 vraddhn.u16 d29, q9, q15
1615 vraddhn.u16 d30, q10, q12
1616 vraddhn.u16 d31, q11, q13
1619 .macro pixman_composite_in_n_8_process_pixblock_tail_head
1620 pixman_composite_in_n_8_process_pixblock_tail
1621 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1622 cache_preload 32, 32
1623 pixman_composite_in_n_8_process_pixblock_head
1624 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* init: load solid value into d3[0]; replication across d0..d3 is not
 * visible in this abridged listing */
1627 .macro pixman_composite_in_n_8_init
1628 add DUMMY, sp, #ARGS_STACK_OFFSET
1629 vld1.32 {d3[0]}, [DUMMY]
1633 .macro pixman_composite_in_n_8_cleanup
1636 generate_composite_function \
1637 pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
1638 FLAG_DST_READWRITE, \
1639 32, /* number of pixels, processed in a single block */ \
1640 5, /* prefetch distance */ \
1641 pixman_composite_in_n_8_init, \
1642 pixman_composite_in_n_8_cleanup, \
1643 pixman_composite_in_n_8_process_pixblock_head, \
1644 pixman_composite_in_n_8_process_pixblock_tail, \
1645 pixman_composite_in_n_8_process_pixblock_tail_head, \
1646 28, /* dst_w_basereg */ \
1647 4, /* dst_r_basereg */ \
1648 0, /* src_basereg */ \
1649 24 /* mask_basereg */
/*
 * add_n_8_8: ADD of a solid value through an a8 mask onto an a8 destination,
 * 32 pixels per block.  Here only the alpha channel d11 of the solid source
 * is used; the mask is multiplied by it, reduced with the rounded /255 idiom,
 * and saturating-added onto the destination (q2/q3).
 */
1651 .macro pixman_composite_add_n_8_8_process_pixblock_head
1652 /* expecting source data in {d8, d9, d10, d11} */
1653 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
1654 /* and destination data in {d4, d5, d6, d7} */
1655 /* mask is in d24, d25, d26, d27 */
1656 vmull.u8 q0, d24, d11
1657 vmull.u8 q1, d25, d11
1658 vmull.u8 q6, d26, d11
1659 vmull.u8 q7, d27, d11
1660 vrshr.u16 q10, q0, #8
1661 vrshr.u16 q11, q1, #8
1662 vrshr.u16 q12, q6, #8
1663 vrshr.u16 q13, q7, #8
1664 vraddhn.u16 d0, q0, q10
1665 vraddhn.u16 d1, q1, q11
1666 vraddhn.u16 d2, q6, q12
1667 vraddhn.u16 d3, q7, q13
/* saturating add onto destination: results land in q14/q15 for the store */
1668 vqadd.u8 q14, q0, q2
1669 vqadd.u8 q15, q1, q3
/* tail is empty -- all work is done in the head */
1672 .macro pixman_composite_add_n_8_8_process_pixblock_tail
1675 /* TODO: expand macros and do better instructions scheduling */
1676 .macro pixman_composite_add_n_8_8_process_pixblock_tail_head
1677 pixman_composite_add_n_8_8_process_pixblock_tail
1678 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1679 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1681 cache_preload 32, 32
1682 pixman_composite_add_n_8_8_process_pixblock_head
1685 .macro pixman_composite_add_n_8_8_init
1686 add DUMMY, sp, #ARGS_STACK_OFFSET
/* load solid color; alpha duplication across d11 is not visible in this
 * abridged listing */
1688 vld1.32 {d11[0]}, [DUMMY]
1692 .macro pixman_composite_add_n_8_8_cleanup
1696 generate_composite_function \
1697 pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
1698 FLAG_DST_READWRITE, \
1699 32, /* number of pixels, processed in a single block */ \
1700 5, /* prefetch distance */ \
1701 pixman_composite_add_n_8_8_init, \
1702 pixman_composite_add_n_8_8_cleanup, \
1703 pixman_composite_add_n_8_8_process_pixblock_head, \
1704 pixman_composite_add_n_8_8_process_pixblock_tail, \
1705 pixman_composite_add_n_8_8_process_pixblock_tail_head
1707 /******************************************************************************/
/*
 * add_8_8_8: ADD of an a8 source through an a8 mask onto an a8 destination,
 * 32 pixels per block: dst = satadd(dst, src*mask/255 rounded).
 */
1709 .macro pixman_composite_add_8_8_8_process_pixblock_head
1710 /* expecting source data in {d0, d1, d2, d3} */
1711 /* destination data in {d4, d5, d6, d7} */
1712 /* mask in {d24, d25, d26, d27} */
1713 vmull.u8 q8, d24, d0
1714 vmull.u8 q9, d25, d1
1715 vmull.u8 q10, d26, d2
1716 vmull.u8 q11, d27, d3
1717 vrshr.u16 q0, q8, #8
1718 vrshr.u16 q1, q9, #8
1719 vrshr.u16 q12, q10, #8
1720 vrshr.u16 q13, q11, #8
1721 vraddhn.u16 d0, q0, q8
1722 vraddhn.u16 d1, q1, q9
1723 vraddhn.u16 d2, q12, q10
1724 vraddhn.u16 d3, q13, q11
1725 vqadd.u8 q14, q0, q2
1726 vqadd.u8 q15, q1, q3
/* tail is empty -- all work is done in the head */
1729 .macro pixman_composite_add_8_8_8_process_pixblock_tail
1732 /* TODO: expand macros and do better instructions scheduling */
1733 .macro pixman_composite_add_8_8_8_process_pixblock_tail_head
1734 pixman_composite_add_8_8_8_process_pixblock_tail
1735 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1736 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1739 cache_preload 32, 32
1740 pixman_composite_add_8_8_8_process_pixblock_head
/* init/cleanup intentionally empty in the full file (nothing to set up) --
 * their `.endm` closers are not visible in this abridged listing */
1743 .macro pixman_composite_add_8_8_8_init
1746 .macro pixman_composite_add_8_8_8_cleanup
1749 generate_composite_function \
1750 pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
1751 FLAG_DST_READWRITE, \
1752 32, /* number of pixels, processed in a single block */ \
1753 5, /* prefetch distance */ \
1754 pixman_composite_add_8_8_8_init, \
1755 pixman_composite_add_8_8_8_cleanup, \
1756 pixman_composite_add_8_8_8_process_pixblock_head, \
1757 pixman_composite_add_8_8_8_process_pixblock_tail, \
1758 pixman_composite_add_8_8_8_process_pixblock_tail_head
1760 /******************************************************************************/
/*
 * add_8888_8888_8888: ADD of a8r8g8b8 src through an a8r8g8b8 mask (only the
 * mask alpha d27 is used) onto a8r8g8b8, 8 pixels per block.  Uses
 * vrsra (accumulate x + round(x>>8)) + vrshrn instead of the vrshr/vraddhn
 * pair -- an equivalent rounded /255 formulation with different scheduling
 * characteristics (the "bubble" comments mark Cortex-A8 issue-slot gaps).
 */
1762 .macro pixman_composite_add_8888_8888_8888_process_pixblock_head
1763 /* expecting source data in {d0, d1, d2, d3} */
1764 /* destination data in {d4, d5, d6, d7} */
1765 /* mask in {d24, d25, d26, d27} */
1766 vmull.u8 q8, d27, d0
1767 vmull.u8 q9, d27, d1
1768 vmull.u8 q10, d27, d2
1769 vmull.u8 q11, d27, d3
1770 /* 1 cycle bubble */
1771 vrsra.u16 q8, q8, #8
1772 vrsra.u16 q9, q9, #8
1773 vrsra.u16 q10, q10, #8
1774 vrsra.u16 q11, q11, #8
1777 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
1778 /* 2 cycle bubble */
1779 vrshrn.u16 d28, q8, #8
1780 vrshrn.u16 d29, q9, #8
1781 vrshrn.u16 d30, q10, #8
1782 vrshrn.u16 d31, q11, #8
1783 vqadd.u8 q14, q2, q14
1784 /* 1 cycle bubble */
1785 vqadd.u8 q15, q3, q15
/* tail_head: expanded/interleaved copy of tail + head with loads/stores mixed
 * in -- statement order is load-bearing, do not reorder */
1788 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
1790 vrshrn.u16 d28, q8, #8
1792 vrshrn.u16 d29, q9, #8
1793 vmull.u8 q8, d27, d0
1794 vrshrn.u16 d30, q10, #8
1795 vmull.u8 q9, d27, d1
1796 vrshrn.u16 d31, q11, #8
1797 vmull.u8 q10, d27, d2
1798 vqadd.u8 q14, q2, q14
1799 vmull.u8 q11, d27, d3
1800 vqadd.u8 q15, q3, q15
1801 vrsra.u16 q8, q8, #8
1802 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1803 vrsra.u16 q9, q9, #8
1804 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1805 vrsra.u16 q10, q10, #8
1809 vrsra.u16 q11, q11, #8
/* NOTE(review): the default_init/default_cleanup argument lines of the two
 * invocations below are not visible in this abridged listing */
1812 generate_composite_function \
1813 pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
1814 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1815 8, /* number of pixels, processed in a single block */ \
1816 10, /* prefetch distance */ \
1819 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1820 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1821 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
/* single-scanline variant reusing the same pixblock macros */
1823 generate_composite_function_single_scanline \
1824 pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
1825 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1826 8, /* number of pixels, processed in a single block */ \
1829 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1830 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1831 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
1833 /******************************************************************************/
/*
 * add_8888_8_8888: same pipeline driven with an a8 mask; mask_basereg 27
 * places the loaded mask bytes in d27 where the head expects them.
 */
1835 generate_composite_function \
1836 pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
1837 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1838 8, /* number of pixels, processed in a single block */ \
1839 5, /* prefetch distance */ \
1842 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1843 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1844 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
1845 28, /* dst_w_basereg */ \
1846 4, /* dst_r_basereg */ \
1847 0, /* src_basereg */ \
1848 27 /* mask_basereg */
1850 /******************************************************************************/
/*
 * add_n_8_8888: solid source + a8 mask variant of the add_8888_* pipeline.
 * init loads the solid color into d3[0] (the replication into d0..d3 on the
 * following lines is not visible in this abridged listing).
 */
1852 .macro pixman_composite_add_n_8_8888_init
1853 add DUMMY, sp, #ARGS_STACK_OFFSET
1854 vld1.32 {d3[0]}, [DUMMY]
1861 .macro pixman_composite_add_n_8_8888_cleanup
1864 generate_composite_function \
1865 pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
1866 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1867 8, /* number of pixels, processed in a single block */ \
1868 5, /* prefetch distance */ \
1869 pixman_composite_add_n_8_8888_init, \
1870 pixman_composite_add_n_8_8888_cleanup, \
1871 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1872 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1873 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
1874 28, /* dst_w_basereg */ \
1875 4, /* dst_r_basereg */ \
1876 0, /* src_basereg */ \
1877 27 /* mask_basereg */
1879 /******************************************************************************/
/*
 * add_8888_n_8888: a8r8g8b8 source with a solid mask; init loads the solid
 * mask directly into d27[0] (offset +8 on the stack skips the source arg) --
 * its duplication across the lane is not visible in this abridged listing.
 */
1881 .macro pixman_composite_add_8888_n_8888_init
1882 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
1883 vld1.32 {d27[0]}, [DUMMY]
1887 .macro pixman_composite_add_8888_n_8888_cleanup
1890 generate_composite_function \
1891 pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
1892 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1893 8, /* number of pixels, processed in a single block */ \
1894 5, /* prefetch distance */ \
1895 pixman_composite_add_8888_n_8888_init, \
1896 pixman_composite_add_8888_n_8888_cleanup, \
1897 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1898 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1899 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
1900 28, /* dst_w_basereg */ \
1901 4, /* dst_r_basereg */ \
1902 0, /* src_basereg */ \
1903 27 /* mask_basereg */
1905 /******************************************************************************/
/*
 * out_reverse_8888_n_8888: dst = dst * (1 - (src*mask).alpha).  The source is
 * first attenuated by the solid mask in d15, its alpha is inverted, and the
 * destination is multiplied by that inverted alpha.  These macros are also
 * the building blocks for the over_8888_* variants further down.
 */
1907 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
1908 /* expecting source data in {d0, d1, d2, d3} */
1909 /* destination data in {d4, d5, d6, d7} */
1910 /* solid mask is in d15 */
/* src * mask with rounded /255 reduction */
1913 vmull.u8 q8, d15, d3
1914 vmull.u8 q6, d15, d2
1915 vmull.u8 q5, d15, d1
1916 vmull.u8 q4, d15, d0
1917 vrshr.u16 q13, q8, #8
1918 vrshr.u16 q12, q6, #8
1919 vrshr.u16 q11, q5, #8
1920 vrshr.u16 q10, q4, #8
1921 vraddhn.u16 d3, q8, q13
1922 vraddhn.u16 d2, q6, q12
1923 vraddhn.u16 d1, q5, q11
1924 vraddhn.u16 d0, q4, q10
1925 vmvn.8 d24, d3 /* get inverted alpha */
1926 /* now do alpha blending */
1927 vmull.u8 q8, d24, d4
1928 vmull.u8 q9, d24, d5
1929 vmull.u8 q10, d24, d6
1930 vmull.u8 q11, d24, d7
1933 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
1934 vrshr.u16 q14, q8, #8
1935 vrshr.u16 q15, q9, #8
1936 vrshr.u16 q12, q10, #8
1937 vrshr.u16 q13, q11, #8
1938 vraddhn.u16 d28, q14, q8
1939 vraddhn.u16 d29, q15, q9
1940 vraddhn.u16 d30, q12, q10
1941 vraddhn.u16 d31, q13, q11
1944 /* TODO: expand macros and do better instructions scheduling */
1945 .macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
1946 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1947 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
1951 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
1952 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* NOTE(review): the `tail_head \` line below has no comma before the next
 * argument -- GNU as accepts whitespace-separated .macro arguments, so this
 * assembles, but it is inconsistent with the comma style elsewhere */
1955 generate_composite_function_single_scanline \
1956 pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
1957 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1958 8, /* number of pixels, processed in a single block */ \
1959 default_init_need_all_regs, \
1960 default_cleanup_need_all_regs, \
1961 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
1962 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
1963 pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \
1964 28, /* dst_w_basereg */ \
1965 4, /* dst_r_basereg */ \
1966 0, /* src_basereg */ \
1967 12 /* mask_basereg */
1969 /******************************************************************************/
/*
 * over_8888_n_8888: OVER of a8r8g8b8 source with a solid mask (d15) onto
 * a8r8g8b8.  head/tail build on the out_reverse macros above; OVER only adds
 * the saturating add of the masked source on top of the attenuated dest.
 */
1971 .macro pixman_composite_over_8888_n_8888_process_pixblock_head
1972 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
1975 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail
1976 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
1977 vqadd.u8 q14, q0, q14
1978 vqadd.u8 q15, q1, q15
1981 /* TODO: expand macros and do better instructions scheduling */
1982 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
1983 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1984 pixman_composite_over_8888_n_8888_process_pixblock_tail
1987 pixman_composite_over_8888_n_8888_process_pixblock_head
1988 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* init: the `add DUMMY, sp, ...` preceding this load is not visible in this
 * abridged listing; d15 receives the solid mask used by the head */
1991 .macro pixman_composite_over_8888_n_8888_init
1994 vld1.32 {d15[0]}, [DUMMY]
1998 .macro pixman_composite_over_8888_n_8888_cleanup
2002 generate_composite_function \
2003 pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
2004 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2005 8, /* number of pixels, processed in a single block */ \
2006 5, /* prefetch distance */ \
2007 pixman_composite_over_8888_n_8888_init, \
2008 pixman_composite_over_8888_n_8888_cleanup, \
2009 pixman_composite_over_8888_n_8888_process_pixblock_head, \
2010 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2011 pixman_composite_over_8888_n_8888_process_pixblock_tail_head
2013 /******************************************************************************/
/*
 * over_8888_8888_8888: OVER with a per-pixel a8r8g8b8 mask; reuses the
 * over_8888_n_8888 head/tail (mask alpha ends up in d15 via mask_basereg 12).
 */
2015 /* TODO: expand macros and do better instructions scheduling */
2016 .macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
2017 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
2018 pixman_composite_over_8888_n_8888_process_pixblock_tail
2022 pixman_composite_over_8888_n_8888_process_pixblock_head
2023 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* NOTE(review): as with the out_reverse invocation, `tail_head \` below is
 * followed by the next argument without a comma (whitespace-separated .macro
 * argument -- legal in GNU as, but stylistically inconsistent) */
2026 generate_composite_function \
2027 pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
2028 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2029 8, /* number of pixels, processed in a single block */ \
2030 5, /* prefetch distance */ \
2031 default_init_need_all_regs, \
2032 default_cleanup_need_all_regs, \
2033 pixman_composite_over_8888_n_8888_process_pixblock_head, \
2034 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2035 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
2036 28, /* dst_w_basereg */ \
2037 4, /* dst_r_basereg */ \
2038 0, /* src_basereg */ \
2039 12 /* mask_basereg */
/* single-scanline variant sharing the same pixblock macros */
2041 generate_composite_function_single_scanline \
2042 pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
2043 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2044 8, /* number of pixels, processed in a single block */ \
2045 default_init_need_all_regs, \
2046 default_cleanup_need_all_regs, \
2047 pixman_composite_over_8888_n_8888_process_pixblock_head, \
2048 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2049 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
2050 28, /* dst_w_basereg */ \
2051 4, /* dst_r_basereg */ \
2052 0, /* src_basereg */ \
2053 12 /* mask_basereg */
2055 /******************************************************************************/
/*
 * over_8888_8_8888: OVER with a per-pixel a8 mask; mask_basereg 15 places the
 * loaded mask bytes in d15 where the shared over_8888_n_8888 head reads them.
 */
2057 /* TODO: expand macros and do better instructions scheduling */
2058 .macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
2059 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
2060 pixman_composite_over_8888_n_8888_process_pixblock_tail
2064 pixman_composite_over_8888_n_8888_process_pixblock_head
2065 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2068 generate_composite_function \
2069 pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
2070 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2071 8, /* number of pixels, processed in a single block */ \
2072 5, /* prefetch distance */ \
2073 default_init_need_all_regs, \
2074 default_cleanup_need_all_regs, \
2075 pixman_composite_over_8888_n_8888_process_pixblock_head, \
2076 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2077 pixman_composite_over_8888_8_8888_process_pixblock_tail_head \
2078 28, /* dst_w_basereg */ \
2079 4, /* dst_r_basereg */ \
2080 0, /* src_basereg */ \
2081 15 /* mask_basereg */
2083 /******************************************************************************/
/*
 * SRC operation: straight copy of packed 24bpp (0888) pixels.  No per-pixel
 * arithmetic is needed, so head and tail are empty; the tail_head just
 * stores the three 8-bit component planes fetched by the template.
 */
2085 .macro pixman_composite_src_0888_0888_process_pixblock_head
2088 .macro pixman_composite_src_0888_0888_process_pixblock_tail
2091 .macro pixman_composite_src_0888_0888_process_pixblock_tail_head
2092 vst3.8 {d0, d1, d2}, [DST_W]! /* interleaved 3-byte store; dst not 128-bit aligned */
/* Pure copy: destination is write-only and all base registers stay at 0. */
2097 generate_composite_function \
2098 pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
2099 FLAG_DST_WRITEONLY, \
2100 8, /* number of pixels, processed in a single block */ \
2101 10, /* prefetch distance */ \
2104 pixman_composite_src_0888_0888_process_pixblock_head, \
2105 pixman_composite_src_0888_0888_process_pixblock_tail, \
2106 pixman_composite_src_0888_0888_process_pixblock_tail_head, \
2107 0, /* dst_w_basereg */ \
2108 0, /* dst_r_basereg */ \
2109 0, /* src_basereg */ \
2110 0 /* mask_basereg */
2112 /******************************************************************************/
/*
 * SRC operation: 24bpp (0888) source expanded to 32bpp destination with
 * R/B swapped ("rev").  The custom init macro presumably prepares the
 * constant alpha/swizzle state — its body is not visible here.
 */
2114 .macro pixman_composite_src_0888_8888_rev_process_pixblock_head
2118 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
2121 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
2122 vst4.8 {d0, d1, d2, d3}, [DST_W]! /* store 4 deinterleaved planes (adds 4th channel) */
2128 .macro pixman_composite_src_0888_8888_rev_init
2132 generate_composite_function \
2133 pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
2134 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2135 8, /* number of pixels, processed in a single block */ \
2136 10, /* prefetch distance */ \
2137 pixman_composite_src_0888_8888_rev_init, \
2139 pixman_composite_src_0888_8888_rev_process_pixblock_head, \
2140 pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
2141 pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
2142 0, /* dst_w_basereg */ \
2143 0, /* dst_r_basereg */ \
2144 0, /* src_basereg */ \
2145 0 /* mask_basereg */
2147 /******************************************************************************/
/*
 * SRC operation: 24bpp (0888) source converted to r5g6b5 with R/B swapped.
 * The tail packs the three widened component planes into 5:6:5 format:
 * vshll places one component in the top 8 bits of each 16-bit lane, then
 * vsri shifts-and-inserts the other two components at bit offsets 5 and 11.
 */
2149 .macro pixman_composite_src_0888_0565_rev_process_pixblock_head
2154 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
2155 vshll.u8 q14, d0, #8
2156 vsri.u16 q14, q8, #5
2157 vsri.u16 q14, q9, #11
2160 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
2161 vshll.u8 q14, d0, #8 /* same 565 packing as the tail, interleaved with next fetch */
2163 vsri.u16 q14, q8, #5
2164 vsri.u16 q14, q9, #11
2166 vst1.16 {d28, d29}, [DST_W, :128]! /* store 8 packed 565 pixels */
2170 generate_composite_function \
2171 pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
2172 FLAG_DST_WRITEONLY, \
2173 8, /* number of pixels, processed in a single block */ \
2174 10, /* prefetch distance */ \
2177 pixman_composite_src_0888_0565_rev_process_pixblock_head, \
2178 pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
2179 pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
2180 28, /* dst_w_basereg */ \
2181 0, /* dst_r_basereg */ \
2182 0, /* src_basereg */ \
2183 0 /* mask_basereg */
2185 /******************************************************************************/
/*
 * SRC operation: convert a non-premultiplied "pixbuf" source to premultiplied
 * a8r8g8b8.  Head multiplies each color component by the alpha (d3); the
 * vrshr/vraddhn pairs in the tail implement the rounding x/255 reduction of
 * each 16-bit product ((x + ((x + 128) >> 8) + 128) >> 8).
 */
2187 .macro pixman_composite_src_pixbuf_8888_process_pixblock_head
2190 vmull.u8 q10, d3, d2 /* alpha * component (other two products not visible here) */
2193 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
2194 vrshr.u16 q11, q8, #8
2196 vrshr.u16 q12, q9, #8
2197 vrshr.u16 q13, q10, #8
2198 vraddhn.u16 d30, q11, q8
2199 vraddhn.u16 d29, q12, q9
2200 vraddhn.u16 d28, q13, q10
/*
 * Scheduled tail_head: the /255 reductions of the previous block are
 * interleaved with the PF prefetcher bookkeeping and the next block's
 * multiplies, hiding instruction latencies.
 */
2203 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
2204 vrshr.u16 q11, q8, #8
2206 vrshr.u16 q12, q9, #8
2207 vrshr.u16 q13, q10, #8
2209 vraddhn.u16 d30, q11, q8
2210 PF add PF_X, PF_X, #8
2212 PF addne PF_X, PF_X, #8
2213 PF subne PF_CTL, PF_CTL, #1
2214 vraddhn.u16 d29, q12, q9
2215 vraddhn.u16 d28, q13, q10
2218 vmull.u8 q10, d3, d2
2219 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2221 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
2222 PF subge PF_X, PF_X, ORIG_W
2223 PF subges PF_CTL, PF_CTL, #0x10
2224 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! /* advance prefetch to next scanline */
2227 generate_composite_function \
2228 pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
2229 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2230 8, /* number of pixels, processed in a single block */ \
2231 10, /* prefetch distance */ \
2234 pixman_composite_src_pixbuf_8888_process_pixblock_head, \
2235 pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
2236 pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
2237 28, /* dst_w_basereg */ \
2238 0, /* dst_r_basereg */ \
2239 0, /* src_basereg */ \
2240 0 /* mask_basereg */
2242 /******************************************************************************/
/*
 * SRC operation: like src_pixbuf_8888 (premultiply by alpha with rounding
 * /255), but the reduced components land in d28/d29/d30 in the opposite
 * order — i.e. with red and blue swapped in the output ("rpixbuf").
 */
2244 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
2247 vmull.u8 q10, d3, d2 /* alpha * component (other two products not visible here) */
2250 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
2251 vrshr.u16 q11, q8, #8
2253 vrshr.u16 q12, q9, #8
2254 vrshr.u16 q13, q10, #8
2255 vraddhn.u16 d28, q11, q8
2256 vraddhn.u16 d29, q12, q9
2257 vraddhn.u16 d30, q13, q10
/* Scheduled tail_head: identical structure to the pixbuf variant above. */
2260 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
2261 vrshr.u16 q11, q8, #8
2263 vrshr.u16 q12, q9, #8
2264 vrshr.u16 q13, q10, #8
2266 vraddhn.u16 d28, q11, q8
2267 PF add PF_X, PF_X, #8
2269 PF addne PF_X, PF_X, #8
2270 PF subne PF_CTL, PF_CTL, #1
2271 vraddhn.u16 d29, q12, q9
2272 vraddhn.u16 d30, q13, q10
2275 vmull.u8 q10, d3, d2
2276 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2278 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
2279 PF subge PF_X, PF_X, ORIG_W
2280 PF subges PF_CTL, PF_CTL, #0x10
2281 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! /* advance prefetch to next scanline */
2284 generate_composite_function \
2285 pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
2286 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2287 8, /* number of pixels, processed in a single block */ \
2288 10, /* prefetch distance */ \
2291 pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
2292 pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
2293 pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
2294 28, /* dst_w_basereg */ \
2295 0, /* dst_r_basereg */ \
2296 0, /* src_basereg */ \
2297 0 /* mask_basereg */
2299 /******************************************************************************/
/*
 * OVER operation: r5g6b5 source, a8 mask, r5g6b5 destination.
 * Both source and destination are first expanded to planar x888, the source
 * is multiplied by the mask, the destination by the (complement-derived)
 * factor in d7, and the results are combined and repacked to 565.
 */
2301 .macro pixman_composite_over_0565_8_0565_process_pixblock_head
2302 /* mask is in d15 */
2303 convert_0565_to_x888 q4, d2, d1, d0
2304 convert_0565_to_x888 q5, d6, d5, d4
2305 /* source pixel data is in {d0, d1, d2, XX} */
2306 /* destination pixel data is in {d4, d5, d6, XX} */
2308 vmull.u8 q6, d15, d2 /* src * mask, per component */
2309 vmull.u8 q5, d15, d1
2310 vmull.u8 q4, d15, d0
2313 vmull.u8 q13, d7, d6 /* dst * inverse factor (d7 set up off-screen) */
2314 vrshr.u16 q12, q6, #8 /* rounding /255 of the src*mask products... */
2315 vrshr.u16 q11, q5, #8
2316 vrshr.u16 q10, q4, #8
2317 vraddhn.u16 d2, q6, q12
2318 vraddhn.u16 d1, q5, q11
2319 vraddhn.u16 d0, q4, q10
2322 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail
2323 vrshr.u16 q14, q8, #8 /* finish /255 of the dst products */
2324 vrshr.u16 q15, q9, #8
2325 vrshr.u16 q12, q13, #8
2326 vraddhn.u16 d28, q14, q8
2327 vraddhn.u16 d29, q15, q9
2328 vraddhn.u16 d30, q12, q13
2329 vqadd.u8 q0, q0, q14 /* saturating add of masked src and attenuated dst */
2330 vqadd.u8 q1, q1, q15
2331 /* 32bpp result is in {d0, d1, d2, XX} */
2332 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2335 /* TODO: expand macros and do better instructions scheduling */
2336 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
2338 pixman_composite_over_0565_8_0565_process_pixblock_tail
2340 vld1.16 {d10, d11}, [DST_R, :128]! /* reload 8 dst 565 pixels */
2342 pixman_composite_over_0565_8_0565_process_pixblock_head
2343 vst1.16 {d28, d29}, [DST_W, :128]! /* store previous block's 565 result */
2346 generate_composite_function \
2347 pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
2348 FLAG_DST_READWRITE, \
2349 8, /* number of pixels, processed in a single block */ \
2350 5, /* prefetch distance */ \
2351 default_init_need_all_regs, \
2352 default_cleanup_need_all_regs, \
2353 pixman_composite_over_0565_8_0565_process_pixblock_head, \
2354 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2355 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2356 28, /* dst_w_basereg */ \
2357 10, /* dst_r_basereg */ \
2358 8, /* src_basereg */ \
2359 15 /* mask_basereg */
2361 /******************************************************************************/
/*
 * OVER operation with a constant (solid) mask: r5g6b5 src/dst.  Reuses the
 * whole over_0565_8_0565 pixel pipeline; init loads the constant mask
 * argument from the stack into d15 once, instead of reloading per pixel
 * block.  (Lane duplication across d15, if any, is done on lines not
 * visible here — NOTE(review): confirm against the full source.)
 */
2363 .macro pixman_composite_over_0565_n_0565_init
2364 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) /* address of the mask argument */
2366 vld1.32 {d15[0]}, [DUMMY]
2370 .macro pixman_composite_over_0565_n_0565_cleanup
2374 generate_composite_function \
2375 pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
2376 FLAG_DST_READWRITE, \
2377 8, /* number of pixels, processed in a single block */ \
2378 5, /* prefetch distance */ \
2379 pixman_composite_over_0565_n_0565_init, \
2380 pixman_composite_over_0565_n_0565_cleanup, \
2381 pixman_composite_over_0565_8_0565_process_pixblock_head, \
2382 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2383 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2384 28, /* dst_w_basereg */ \
2385 10, /* dst_r_basereg */ \
2386 8, /* src_basereg */ \
2387 15 /* mask_basereg */
2389 /******************************************************************************/
/*
 * ADD operation: r5g6b5 source, a8 mask, r5g6b5 destination.
 * Head expands src and dst to planar x888 and computes src*mask with the
 * rounding /255 reduction; the (mostly off-screen) tail saturate-adds the
 * destination and repacks to 565.
 */
2391 .macro pixman_composite_add_0565_8_0565_process_pixblock_head
2392 /* mask is in d15 */
2393 convert_0565_to_x888 q4, d2, d1, d0
2394 convert_0565_to_x888 q5, d6, d5, d4
2395 /* source pixel data is in {d0, d1, d2, XX} */
2396 /* destination pixel data is in {d4, d5, d6, XX} */
2397 vmull.u8 q6, d15, d2 /* src * mask, per component */
2398 vmull.u8 q5, d15, d1
2399 vmull.u8 q4, d15, d0
2400 vrshr.u16 q12, q6, #8 /* rounding /255 of the products */
2401 vrshr.u16 q11, q5, #8
2402 vrshr.u16 q10, q4, #8
2403 vraddhn.u16 d2, q6, q12
2404 vraddhn.u16 d1, q5, q11
2405 vraddhn.u16 d0, q4, q10
2408 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail
2411 /* 32bpp result is in {d0, d1, d2, XX} */
2412 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2415 /* TODO: expand macros and do better instructions scheduling */
2416 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
2418 pixman_composite_add_0565_8_0565_process_pixblock_tail
2420 vld1.16 {d10, d11}, [DST_R, :128]! /* reload 8 dst 565 pixels */
2422 pixman_composite_add_0565_8_0565_process_pixblock_head
2423 vst1.16 {d28, d29}, [DST_W, :128]! /* store previous block's result */
2426 generate_composite_function \
2427 pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
2428 FLAG_DST_READWRITE, \
2429 8, /* number of pixels, processed in a single block */ \
2430 5, /* prefetch distance */ \
2431 default_init_need_all_regs, \
2432 default_cleanup_need_all_regs, \
2433 pixman_composite_add_0565_8_0565_process_pixblock_head, \
2434 pixman_composite_add_0565_8_0565_process_pixblock_tail, \
2435 pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
2436 28, /* dst_w_basereg */ \
2437 10, /* dst_r_basereg */ \
2438 8, /* src_basereg */ \
2439 15 /* mask_basereg */
2441 /******************************************************************************/
/*
 * OUT_REVERSE operation: a8 source (used as mask), r5g6b5 destination —
 * dst = dst * (1 - src.a).  The destination is expanded to planar x888,
 * multiplied by the inverted alpha, reduced with the rounding /255 idiom,
 * and repacked to 565.
 */
2443 .macro pixman_composite_out_reverse_8_0565_process_pixblock_head
2444 /* mask is in d15 */
2445 convert_0565_to_x888 q5, d6, d5, d4
2446 /* destination pixel data is in {d4, d5, d6, xx} */
2447 vmvn.8 d24, d15 /* get inverted alpha */
2448 /* now do alpha blending */
2449 vmull.u8 q8, d24, d4
2450 vmull.u8 q9, d24, d5
2451 vmull.u8 q10, d24, d6
2454 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
2455 vrshr.u16 q14, q8, #8 /* rounding /255 of each 16-bit product */
2456 vrshr.u16 q15, q9, #8
2457 vrshr.u16 q12, q10, #8
2458 vraddhn.u16 d0, q14, q8
2459 vraddhn.u16 d1, q15, q9
2460 vraddhn.u16 d2, q12, q10
2461 /* 32bpp result is in {d0, d1, d2, XX} */
2462 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2465 /* TODO: expand macros and do better instructions scheduling */
2466 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
2468 pixman_composite_out_reverse_8_0565_process_pixblock_tail
2469 vld1.16 {d10, d11}, [DST_R, :128]! /* reload 8 dst 565 pixels */
2471 pixman_composite_out_reverse_8_0565_process_pixblock_head
2472 vst1.16 {d28, d29}, [DST_W, :128]! /* store previous block's result */
2475 generate_composite_function \
2476 pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
2477 FLAG_DST_READWRITE, \
2478 8, /* number of pixels, processed in a single block */ \
2479 5, /* prefetch distance */ \
2480 default_init_need_all_regs, \
2481 default_cleanup_need_all_regs, \
2482 pixman_composite_out_reverse_8_0565_process_pixblock_head, \
2483 pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
2484 pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
2485 28, /* dst_w_basereg */ \
2486 10, /* dst_r_basereg */ \
2487 15, /* src_basereg */ \
2488 0 /* mask_basereg */
2490 /******************************************************************************/
/*
 * OUT_REVERSE operation: a8 source, a8r8g8b8 destination —
 * dst = dst * (1 - src.a) on all four channels.  Same inverted-alpha
 * multiply + rounding /255 structure as the 0565 variant above, but the
 * 32bpp destination needs no format conversion.
 */
2492 .macro pixman_composite_out_reverse_8_8888_process_pixblock_head
2494 /* destination pixel data is in {d4, d5, d6, d7} */
2495 vmvn.8 d1, d0 /* get inverted alpha */
2496 /* now do alpha blending */
2499 vmull.u8 q10, d1, d6
2500 vmull.u8 q11, d1, d7
2503 .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
2504 vrshr.u16 q14, q8, #8 /* rounding /255 of each 16-bit product */
2505 vrshr.u16 q15, q9, #8
2506 vrshr.u16 q12, q10, #8
2507 vrshr.u16 q13, q11, #8
2508 vraddhn.u16 d28, q14, q8
2509 vraddhn.u16 d29, q15, q9
2510 vraddhn.u16 d30, q12, q10
2511 vraddhn.u16 d31, q13, q11
2512 /* 32bpp result is in {d28, d29, d30, d31} */
2515 /* TODO: expand macros and do better instructions scheduling */
2516 .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
2518 pixman_composite_out_reverse_8_8888_process_pixblock_tail
2519 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! /* deinterleaved dst reload */
2521 pixman_composite_out_reverse_8_8888_process_pixblock_head
2522 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! /* store previous block's result */
2525 generate_composite_function \
2526 pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
2527 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2528 8, /* number of pixels, processed in a single block */ \
2529 5, /* prefetch distance */ \
2532 pixman_composite_out_reverse_8_8888_process_pixblock_head, \
2533 pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
2534 pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
2535 28, /* dst_w_basereg */ \
2536 4, /* dst_r_basereg */ \
2537 0, /* src_basereg */ \
2538 0 /* mask_basereg */
2540 /******************************************************************************/
/*
 * Nearest-neighbour scaled scanline functions.  Each invocation reuses the
 * unscaled operation's head/tail/tail_head pixel pipeline; the
 * nearest-scanline template supplies the per-pixel source stepping instead
 * of a linear source fetch.
 */
2542 generate_composite_function_nearest_scanline \
2543 pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
2544 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2545 8, /* number of pixels, processed in a single block */ \
2548 pixman_composite_over_8888_8888_process_pixblock_head, \
2549 pixman_composite_over_8888_8888_process_pixblock_tail, \
2550 pixman_composite_over_8888_8888_process_pixblock_tail_head
2552 generate_composite_function_nearest_scanline \
2553 pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
2554 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2555 8, /* number of pixels, processed in a single block */ \
2558 pixman_composite_over_8888_0565_process_pixblock_head, \
2559 pixman_composite_over_8888_0565_process_pixblock_tail, \
2560 pixman_composite_over_8888_0565_process_pixblock_tail_head, \
2561 28, /* dst_w_basereg */ \
2562 4, /* dst_r_basereg */ \
2563 0, /* src_basereg */ \
2564 24 /* mask_basereg */
2566 generate_composite_function_nearest_scanline \
2567 pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
2568 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2569 8, /* number of pixels, processed in a single block */ \
2572 pixman_composite_src_8888_0565_process_pixblock_head, \
2573 pixman_composite_src_8888_0565_process_pixblock_tail, \
2574 pixman_composite_src_8888_0565_process_pixblock_tail_head
2576 generate_composite_function_nearest_scanline \
2577 pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
2578 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2579 8, /* number of pixels, processed in a single block */ \
2582 pixman_composite_src_0565_8888_process_pixblock_head, \
2583 pixman_composite_src_0565_8888_process_pixblock_tail, \
2584 pixman_composite_src_0565_8888_process_pixblock_tail_head
2586 generate_composite_function_nearest_scanline \
2587 pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
2588 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2589 8, /* number of pixels, processed in a single block */ \
2590 default_init_need_all_regs, \
2591 default_cleanup_need_all_regs, \
2592 pixman_composite_over_8888_8_0565_process_pixblock_head, \
2593 pixman_composite_over_8888_8_0565_process_pixblock_tail, \
2594 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
2595 28, /* dst_w_basereg */ \
2596 4, /* dst_r_basereg */ \
2597 8, /* src_basereg */ \
2598 24 /* mask_basereg */
2600 generate_composite_function_nearest_scanline \
2601 pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
2602 FLAG_DST_READWRITE, \
2603 8, /* number of pixels, processed in a single block */ \
2604 default_init_need_all_regs, \
2605 default_cleanup_need_all_regs, \
2606 pixman_composite_over_0565_8_0565_process_pixblock_head, \
2607 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2608 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2609 28, /* dst_w_basereg */ \
2610 10, /* dst_r_basereg */ \
2611 8, /* src_basereg */ \
2612 15 /* mask_basereg */
2614 /******************************************************************************/
/* Declares fname as an ELF function symbol and opens its definition
 * (the .global/label lines are elided in this excerpt). */
2616 /* Supplementary macro for setting function attributes */
2617 .macro pixman_asm_function fname
2622 .type fname, %function
2628 * Bilinear scaling support code which tries to provide pixel fetching, color
2629 * format conversion, and interpolation as separate macros which can be used
2630 * as the basic building blocks for constructing bilinear scanline functions.
/*
 * Fetch the two vertically adjacent source pixels (top and bottom row) for
 * one destination pixel of a bilinear scan.  X is a 16.16 fixed-point
 * source coordinate; its integer part indexes the TOP row, and STRIDE
 * (bottom - top, set up by the scanline template) reaches the bottom row.
 */
2633 .macro bilinear_load_8888 reg1, reg2, tmp
2634 mov TMP1, X, asr #16 /* integer part of the 16.16 x coordinate */
2636 add TMP1, TOP, TMP1, asl #2 /* *4: 32bpp pixels */
2637 vld1.32 {reg1}, [TMP1], STRIDE
2638 vld1.32 {reg2}, [TMP1]
/* 0565 variant: load two packed 565 pixel pairs and expand them to x888. */
2641 .macro bilinear_load_0565 reg1, reg2, tmp
2642 mov TMP1, X, asr #16
2644 add TMP1, TOP, TMP1, asl #1 /* *2: 16bpp pixels */
2645 vld1.32 {reg2[0]}, [TMP1], STRIDE
2646 vld1.32 {reg2[1]}, [TMP1]
2647 convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
/*
 * Load and vertically interpolate two (or four) 8888 pixels: each result
 * accumulator gets top*d28 + bottom*d29, where d28/d29 hold the vertical
 * interpolation weights prepared by the scanline template.
 */
2650 .macro bilinear_load_and_vertical_interpolate_two_8888 \
2651 acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
2653 bilinear_load_8888 reg1, reg2, tmp1
2654 vmull.u8 acc1, reg1, d28
2655 vmlal.u8 acc1, reg2, d29
2656 bilinear_load_8888 reg3, reg4, tmp2
2657 vmull.u8 acc2, reg3, d28
2658 vmlal.u8 acc2, reg4, d29
/* Four-pixel form: simply two applications of the two-pixel form. */
2661 .macro bilinear_load_and_vertical_interpolate_four_8888 \
2662 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
2663 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
2665 bilinear_load_and_vertical_interpolate_two_8888 \
2666 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
2667 bilinear_load_and_vertical_interpolate_two_8888 \
2668 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
/*
 * 0565 counterparts of the loaders above: fetch packed 565 pixel pairs for
 * two source positions, expand to planar x888, then vertically interpolate
 * with the d28/d29 weights.  The four-pixel form interleaves the loads,
 * the vzip-based plane shuffling and the multiplies of the two halves to
 * hide load and instruction latencies.
 */
2671 .macro bilinear_load_and_vertical_interpolate_two_0565 \
2672 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
2674 mov TMP1, X, asr #16
2676 add TMP1, TOP, TMP1, asl #1
2677 mov TMP2, X, asr #16
2679 add TMP2, TOP, TMP2, asl #1
2680 vld1.32 {acc2lo[0]}, [TMP1], STRIDE
2681 vld1.32 {acc2hi[0]}, [TMP2], STRIDE
2682 vld1.32 {acc2lo[1]}, [TMP1]
2683 vld1.32 {acc2hi[1]}, [TMP2]
2684 convert_0565_to_x888 acc2, reg3, reg2, reg1
2689 vmull.u8 acc1, reg1, d28
2690 vmlal.u8 acc1, reg2, d29
2691 vmull.u8 acc2, reg3, d28
2692 vmlal.u8 acc2, reg4, d29
2695 .macro bilinear_load_and_vertical_interpolate_four_0565 \
2696 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
2697 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
2699 mov TMP1, X, asr #16
2701 add TMP1, TOP, TMP1, asl #1
2702 mov TMP2, X, asr #16
2704 add TMP2, TOP, TMP2, asl #1
2705 vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
2706 vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
2707 vld1.32 {xacc2lo[1]}, [TMP1]
2708 vld1.32 {xacc2hi[1]}, [TMP2]
2709 convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
2710 mov TMP1, X, asr #16
2712 add TMP1, TOP, TMP1, asl #1
2713 mov TMP2, X, asr #16
2715 add TMP2, TOP, TMP2, asl #1
2716 vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
2717 vzip.u8 xreg1, xreg3
2718 vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
2719 vzip.u8 xreg2, xreg4
2720 vld1.32 {yacc2lo[1]}, [TMP1]
2721 vzip.u8 xreg3, xreg4
2722 vld1.32 {yacc2hi[1]}, [TMP2]
2723 vzip.u8 xreg1, xreg2
2724 convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
2725 vmull.u8 xacc1, xreg1, d28
2726 vzip.u8 yreg1, yreg3
2727 vmlal.u8 xacc1, xreg2, d29
2728 vzip.u8 yreg2, yreg4
2729 vmull.u8 xacc2, xreg3, d28
2730 vzip.u8 yreg3, yreg4
2731 vmlal.u8 xacc2, xreg4, d29
2732 vzip.u8 yreg1, yreg2
2733 vmull.u8 yacc1, yreg1, d28
2734 vmlal.u8 yacc1, yreg2, d29
2735 vmull.u8 yacc2, yreg3, d28
2736 vmlal.u8 yacc2, yreg4, d29
/*
 * Store numpix interpolated pixels (1, 2 or 4 — the dispatching .if lines
 * are elided in this excerpt) from d0/d1, advancing OUT.  Unsupported
 * counts are a build-time error.
 */
2739 .macro bilinear_store_8888 numpix, tmp1, tmp2
2741 vst1.32 {d0, d1}, [OUT, :128]!
2743 vst1.32 {d0}, [OUT, :64]!
2745 vst1.32 {d0[0]}, [OUT, :32]!
2747 .error bilinear_store_8888 numpix is unsupported
/* 0565 variant: pack the x888 result to 565 first, then store. */
2751 .macro bilinear_store_0565 numpix, tmp1, tmp2
2756 convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
2758 vst1.16 {d2}, [OUT, :64]!
2760 vst1.32 {d2[0]}, [OUT, :32]!
2762 vst1.16 {d2[0]}, [OUT, :16]!
2764 .error bilinear_store_0565 numpix is unsupported
/*
 * Generic bilinear interpolation of 1, 2 or 4 pixels.  After the vertical
 * interpolation (done by the load macros), the horizontal pass computes
 * v*256 - v*wx + v_next*wx via vshll/vmlsl/vmlal with the horizontal
 * weights in d30/d31 (q15), and vshrn extracts the final 8-bit result.
 * q12/q13 hold the stepping horizontal weight accumulator.
 */
2768 .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
2769 bilinear_load_&src_fmt d0, d1, d2
2770 vmull.u8 q1, d0, d28 /* vertical interpolation */
2771 vmlal.u8 q1, d1, d29
2772 /* 5 cycles bubble */
2773 vshll.u16 q0, d2, #8 /* horizontal: v*256 ... */
2774 vmlsl.u16 q0, d2, d30 /* ... - v*wx ... */
2775 vmlal.u16 q0, d3, d30 /* ... + v_next*wx */
2776 /* 5 cycles bubble */
2777 vshrn.u32 d0, q0, #16
2778 /* 3 cycles bubble */
2780 /* 1 cycle bubble */
2781 bilinear_store_&dst_fmt 1, q2, q3
2784 .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
2785 bilinear_load_and_vertical_interpolate_two_&src_fmt \
2786 q1, q11, d0, d1, d20, d21, d22, d23
2787 vshll.u16 q0, d2, #8
2788 vmlsl.u16 q0, d2, d30
2789 vmlal.u16 q0, d3, d30
2790 vshll.u16 q10, d22, #8
2791 vmlsl.u16 q10, d22, d31
2792 vmlal.u16 q10, d23, d31
2793 vshrn.u32 d0, q0, #16
2794 vshrn.u32 d1, q10, #16
2795 vshr.u16 q15, q12, #8 /* next horizontal weights */
2796 vadd.u16 q12, q12, q13 /* step the weight accumulator */
2798 bilinear_store_&dst_fmt 2, q2, q3
2801 .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
2802 bilinear_load_and_vertical_interpolate_four_&src_fmt \
2803 q1, q11, d0, d1, d20, d21, d22, d23 \
2804 q3, q9, d4, d5, d16, d17, d18, d19
2806 sub TMP1, TMP1, STRIDE
2807 vshll.u16 q0, d2, #8
2808 vmlsl.u16 q0, d2, d30
2809 vmlal.u16 q0, d3, d30
2810 vshll.u16 q10, d22, #8
2811 vmlsl.u16 q10, d22, d31
2812 vmlal.u16 q10, d23, d31
2813 vshr.u16 q15, q12, #8
2814 vshll.u16 q2, d6, #8
2815 vmlsl.u16 q2, d6, d30
2816 vmlal.u16 q2, d7, d30
2817 vshll.u16 q8, d18, #8
2819 vmlsl.u16 q8, d18, d31
2820 vmlal.u16 q8, d19, d31
2821 vadd.u16 q12, q12, q13
2822 vshrn.u32 d0, q0, #16
2823 vshrn.u32 d1, q10, #16
2824 vshrn.u32 d4, q2, #16
2825 vshrn.u32 d5, q8, #16
2826 vshr.u16 q15, q12, #8
2829 vadd.u16 q12, q12, q13
2830 bilinear_store_&dst_fmt 4, q2, q3
/*
 * Dispatch helpers: if a hand-scheduled fast path exists for this
 * src_fmt/dst_fmt pair (signalled by a have_bilinear_interpolate_* symbol),
 * use its head/tail/tail_head variants; otherwise fall back to the generic
 * four-pixel macro.  Eight-pixel variants default to two four-pixel steps.
 */
2833 .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
2834 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
2835 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
2837 bilinear_interpolate_four_pixels src_fmt, dst_fmt
2841 .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
2842 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
2843 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
2847 .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
2848 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
2849 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
2851 bilinear_interpolate_four_pixels src_fmt, dst_fmt
2855 .macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
2856 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
2857 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
2859 bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
2860 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
2864 .macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
2865 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
2866 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
2868 bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
2872 .macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
2873 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
2874 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
2876 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
2877 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
/* Bit flags for the 'flags' argument of generate_bilinear_scanline_func:
 * unroll the main loop 4 or 8 pixels per iteration, and optionally allow
 * use of all NEON registers (d8-d15 must then be saved/restored). */
2881 .set BILINEAR_FLAG_UNROLL_4, 0
2882 .set BILINEAR_FLAG_UNROLL_8, 1
2883 .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
2886 * Main template macro for generating NEON optimized bilinear scanline
2889 * Bilinear scanline scaler macro template uses the following arguments:
2890 * fname - name of the function to generate
2891 * src_fmt - source color format (8888 or 0565)
2892 * dst_fmt - destination color format (8888 or 0565)
2893 * src_bpp_shift - (1 << src_bpp_shift) is the size of a source pixel in
 *                 bytes (dst_bpp_shift likewise for destination pixels)
2894 * prefetch_distance - prefetch in the source image by that many pixels ahead
/*
 * NOTE(review): several control-flow lines (labels, conditional branches)
 * of this template are elided in this excerpt; the comments below describe
 * only what the visible instructions establish.
 */
2898 .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
2899 src_bpp_shift, dst_bpp_shift, \
2900 prefetch_distance, flags
2902 pixman_asm_function fname
/* Save callee-saved core registers and unpack the stacked arguments. */
2919 push {r4, r5, r6, r7, r8, r9}
2920 mov PF_OFFS, #prefetch_distance
2921 ldmia ip, {WB, X, UX, WIDTH}
2922 mul PF_OFFS, PF_OFFS, UX /* prefetch offset in 16.16 source steps */
2924 .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
2928 sub STRIDE, BOTTOM, TOP /* byte distance from top row to bottom row */
2938 vadd.u16 d25, d25, d26
2940 /* ensure good destination alignment */
2943 tst OUT, #(1 << dst_bpp_shift)
2945 vshr.u16 q15, q12, #8
2946 vadd.u16 q12, q12, q13
2947 bilinear_interpolate_last_pixel src_fmt, dst_fmt
2948 sub WIDTH, WIDTH, #1
2950 vadd.u16 q13, q13, q13
2951 vshr.u16 q15, q12, #8
2952 vadd.u16 q12, q12, q13
2956 tst OUT, #(1 << (dst_bpp_shift + 1))
2958 bilinear_interpolate_two_pixels src_fmt, dst_fmt
2959 sub WIDTH, WIDTH, #2
2961 .if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
2962 /*********** 8 pixels per iteration *****************/
2965 tst OUT, #(1 << (dst_bpp_shift + 2))
2967 bilinear_interpolate_four_pixels src_fmt, dst_fmt
2968 sub WIDTH, WIDTH, #4
2970 subs WIDTH, WIDTH, #8
2972 mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
2973 bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
2974 subs WIDTH, WIDTH, #8
2977 bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
2978 subs WIDTH, WIDTH, #8
2981 bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
2985 bilinear_interpolate_four_pixels src_fmt, dst_fmt
2988 /*********** 4 pixels per iteration *****************/
2989 subs WIDTH, WIDTH, #4
2991 mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
2992 bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
2993 subs WIDTH, WIDTH, #4
2996 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
2997 subs WIDTH, WIDTH, #4
3000 bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
3002 /****************************************************/
3004 /* handle the remaining trailing pixels */
3007 bilinear_interpolate_two_pixels src_fmt, dst_fmt
3011 bilinear_interpolate_last_pixel src_fmt, dst_fmt
3013 .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
3016 pop {r4, r5, r6, r7, r8, r9}
3036 /*****************************************************************************/
/*
 * Hand-scheduled fast path for bilinear 8888 -> 8888 scaling, four pixels
 * at a time.  The .set below makes the dispatch macros pick these
 * head/tail/tail_head variants instead of the generic four-pixel macro.
 * Loads, vertical multiplies (d28/d29 weights) and horizontal passes
 * (d30/d31 weights) of adjacent pixels are interleaved to hide latency.
 */
3038 .set have_bilinear_interpolate_four_pixels_8888_8888, 1
3040 .macro bilinear_interpolate_four_pixels_8888_8888_head
3041 mov TMP1, X, asr #16
3043 add TMP1, TOP, TMP1, asl #2
3044 mov TMP2, X, asr #16
3046 add TMP2, TOP, TMP2, asl #2
3048 vld1.32 {d22}, [TMP1], STRIDE
3049 vld1.32 {d23}, [TMP1]
3050 mov TMP3, X, asr #16
3052 add TMP3, TOP, TMP3, asl #2
3053 vmull.u8 q8, d22, d28 /* vertical interpolation, pixel 0 */
3054 vmlal.u8 q8, d23, d29
3056 vld1.32 {d22}, [TMP2], STRIDE
3057 vld1.32 {d23}, [TMP2]
3058 mov TMP4, X, asr #16
3060 add TMP4, TOP, TMP4, asl #2
3061 vmull.u8 q9, d22, d28 /* vertical interpolation, pixel 1 */
3062 vmlal.u8 q9, d23, d29
3064 vld1.32 {d22}, [TMP3], STRIDE
3065 vld1.32 {d23}, [TMP3]
3066 vmull.u8 q10, d22, d28 /* vertical interpolation, pixel 2 */
3067 vmlal.u8 q10, d23, d29
3069 vshll.u16 q0, d16, #8 /* horizontal pass starts for pixel 0 */
3070 vmlsl.u16 q0, d16, d30
3071 vmlal.u16 q0, d17, d30
3074 vld1.32 {d16}, [TMP4], STRIDE
3075 vld1.32 {d17}, [TMP4]
3077 vmull.u8 q11, d16, d28 /* vertical interpolation, pixel 3 */
3078 vmlal.u8 q11, d17, d29
3080 vshll.u16 q1, d18, #8 /* horizontal pass starts for pixel 1 */
3081 vmlsl.u16 q1, d18, d31
/* Tail: finish the horizontal passes, narrow to 8 bits and store. */
3084 .macro bilinear_interpolate_four_pixels_8888_8888_tail
3085 vmlal.u16 q1, d19, d31
3086 vshr.u16 q15, q12, #8
3087 vshll.u16 q2, d20, #8
3088 vmlsl.u16 q2, d20, d30
3089 vmlal.u16 q2, d21, d30
3090 vshll.u16 q3, d22, #8
3091 vmlsl.u16 q3, d22, d31
3092 vmlal.u16 q3, d23, d31
3093 vadd.u16 q12, q12, q13 /* step horizontal weight accumulator */
3094 vshrn.u32 d0, q0, #16
3095 vshrn.u32 d1, q1, #16
3096 vshrn.u32 d4, q2, #16
3097 vshr.u16 q15, q12, #8
3098 vshrn.u32 d5, q3, #16
3101 vadd.u16 q12, q12, q13
3102 vst1.32 {d6, d7}, [OUT, :128]!
/* Tail of block N interleaved with head of block N+1 (software pipeline). */
3105 .macro bilinear_interpolate_four_pixels_8888_8888_tail_head
3106 mov TMP1, X, asr #16
3108 add TMP1, TOP, TMP1, asl #2
3109 mov TMP2, X, asr #16
3111 add TMP2, TOP, TMP2, asl #2
3112 vmlal.u16 q1, d19, d31
3113 vshr.u16 q15, q12, #8
3114 vshll.u16 q2, d20, #8
3115 vmlsl.u16 q2, d20, d30
3116 vmlal.u16 q2, d21, d30
3117 vshll.u16 q3, d22, #8
3118 vld1.32 {d20}, [TMP1], STRIDE
3119 vmlsl.u16 q3, d22, d31
3120 vmlal.u16 q3, d23, d31
3121 vld1.32 {d21}, [TMP1]
3122 vmull.u8 q8, d20, d28
3123 vmlal.u8 q8, d21, d29
3124 vshrn.u32 d0, q0, #16
3125 vshrn.u32 d1, q1, #16
3126 vshrn.u32 d4, q2, #16
3127 vld1.32 {d22}, [TMP2], STRIDE
3128 vshrn.u32 d5, q3, #16
3129 vadd.u16 q12, q12, q13
3130 vld1.32 {d23}, [TMP2]
3131 vmull.u8 q9, d22, d28
3132 mov TMP3, X, asr #16
3134 add TMP3, TOP, TMP3, asl #2
3135 mov TMP4, X, asr #16
3137 add TMP4, TOP, TMP4, asl #2
3138 vmlal.u8 q9, d23, d29
3139 vld1.32 {d22}, [TMP3], STRIDE
3140 vshr.u16 q15, q12, #8
3141 vld1.32 {d23}, [TMP3]
3142 vmull.u8 q10, d22, d28
3143 vmlal.u8 q10, d23, d29
3145 vshll.u16 q0, d16, #8
3147 vmlsl.u16 q0, d16, d30
3148 vmlal.u16 q0, d17, d30
3150 vld1.32 {d16}, [TMP4], STRIDE
3151 vadd.u16 q12, q12, q13
3152 vld1.32 {d17}, [TMP4]
3154 vmull.u8 q11, d16, d28
3155 vmlal.u8 q11, d17, d29
3156 vst1.32 {d6, d7}, [OUT, :128]! /* store previous block's 4 pixels */
3157 vshll.u16 q1, d18, #8
3158 vmlsl.u16 q1, d18, d31
3161 /*****************************************************************************/
/*
 * Advertise to the generic bilinear template that a hand-scheduled,
 * unrolled-by-8 a8r8g8b8 -> r5g6b5 implementation is provided below.
 */
3163 .set have_bilinear_interpolate_eight_pixels_8888_0565, 1
/*
 * Pipeline "head" for the unrolled-by-8 bilinear scaling loop
 * (a8r8g8b8 source, r5g6b5 destination).  It loads and starts
 * interpolating the first batch of pixels; the matching *_tail and
 * *_tail_head macros finish a batch and overlap it with the next one.
 *
 * Register roles (NOTE(review): inferred from the surrounding
 * generate_bilinear_scanline_func template — confirm there):
 *   X       - 16.16 fixed-point source x coordinate accumulator
 *   TOP     - pointer to the top source scanline
 *   STRIDE  - byte offset from the top to the bottom scanline
 *   TMP1-4  - per-pixel source pointers (TOP + 4 * (X >> 16))
 *   d28/d29 - vertical interpolation weights (top / bottom row)
 *   q12/q13 - per-pixel horizontal weight accumulators / increments
 *   q15     - current horizontal weights wx = q12 >> 8
 *
 * NOTE(review): the usual "add X, X, UX" x-advance instructions that
 * normally follow each "mov TMPn, X, asr #16" are not visible in this
 * copy of the file — verify nothing was lost in transit.
 */
3165 .macro bilinear_interpolate_eight_pixels_8888_0565_head
/* Pixels 0-1: integer part of X selects the source column (x4 bytes/px) */
3166 mov TMP1, X, asr #16
3168 add TMP1, TOP, TMP1, asl #2
3169 mov TMP2, X, asr #16
3171 add TMP2, TOP, TMP2, asl #2
/* Vertical blend: q8/q9 = top_px * d28 + bottom_px * d29 (per channel) */
3172 vld1.32 {d20}, [TMP1], STRIDE
3173 vld1.32 {d21}, [TMP1]
3174 vmull.u8 q8, d20, d28
3175 vmlal.u8 q8, d21, d29
3176 vld1.32 {d22}, [TMP2], STRIDE
3177 vld1.32 {d23}, [TMP2]
3178 vmull.u8 q9, d22, d28
/* Pixels 2-3: address computation interleaved with the NEON work above */
3179 mov TMP3, X, asr #16
3181 add TMP3, TOP, TMP3, asl #2
3182 mov TMP4, X, asr #16
3184 add TMP4, TOP, TMP4, asl #2
3185 vmlal.u8 q9, d23, d29
3186 vld1.32 {d22}, [TMP3], STRIDE
3187 vld1.32 {d23}, [TMP3]
3188 vmull.u8 q10, d22, d28
3189 vmlal.u8 q10, d23, d29
/* Horizontal blend: q0 = left*(256 - wx) + right*wx, i.e.
 * left*256 - left*wx + right*wx (d30/d31 hold per-pixel wx) */
3190 vshll.u16 q0, d16, #8
3191 vmlsl.u16 q0, d16, d30
3192 vmlal.u16 q0, d17, d30
3194 vld1.32 {d16}, [TMP4], STRIDE
3195 vld1.32 {d17}, [TMP4]
3197 vmull.u8 q11, d16, d28
3198 vmlal.u8 q11, d17, d29
3199 vshll.u16 q1, d18, #8
3200 vmlsl.u16 q1, d18, d31
/* Pixels 4-5 of the batch: same pattern, second group of four */
3202 mov TMP1, X, asr #16
3204 add TMP1, TOP, TMP1, asl #2
3205 mov TMP2, X, asr #16
3207 add TMP2, TOP, TMP2, asl #2
3208 vmlal.u16 q1, d19, d31
/* Refresh horizontal weights from the fixed-point accumulators */
3209 vshr.u16 q15, q12, #8
3210 vshll.u16 q2, d20, #8
3211 vmlsl.u16 q2, d20, d30
3212 vmlal.u16 q2, d21, d30
3213 vshll.u16 q3, d22, #8
3214 vld1.32 {d20}, [TMP1], STRIDE
3215 vmlsl.u16 q3, d22, d31
3216 vmlal.u16 q3, d23, d31
3217 vld1.32 {d21}, [TMP1]
3218 vmull.u8 q8, d20, d28
3219 vmlal.u8 q8, d21, d29
/* Narrow the 32-bit blend results back to 8 bits per channel */
3220 vshrn.u32 d0, q0, #16
3221 vshrn.u32 d1, q1, #16
3222 vshrn.u32 d4, q2, #16
3223 vld1.32 {d22}, [TMP2], STRIDE
3224 vshrn.u32 d5, q3, #16
/* Advance the horizontal weight accumulators to the next pixels */
3225 vadd.u16 q12, q12, q13
3226 vld1.32 {d23}, [TMP2]
3227 vmull.u8 q9, d22, d28
/* Pixels 6-7: last pair of source pointers for this batch */
3228 mov TMP3, X, asr #16
3230 add TMP3, TOP, TMP3, asl #2
3231 mov TMP4, X, asr #16
3233 add TMP4, TOP, TMP4, asl #2
3234 vmlal.u8 q9, d23, d29
3235 vld1.32 {d22}, [TMP3], STRIDE
3236 vshr.u16 q15, q12, #8
3237 vld1.32 {d23}, [TMP3]
3238 vmull.u8 q10, d22, d28
3239 vmlal.u8 q10, d23, d29
3241 vshll.u16 q0, d16, #8
3243 vmlsl.u16 q0, d16, d30
3244 vmlal.u16 q0, d17, d30
3246 vld1.32 {d16}, [TMP4], STRIDE
3247 vadd.u16 q12, q12, q13
3248 vld1.32 {d17}, [TMP4]
3250 vmull.u8 q11, d16, d28
3251 vmlal.u8 q11, d17, d29
/* Leave q1 partially computed; *_tail / *_tail_head complete it */
3252 vshll.u16 q1, d18, #8
3253 vmlsl.u16 q1, d18, d31
/*
 * Pipeline "tail": drains the in-flight work started by *_head (or the
 * last *_tail_head iteration), finishes the horizontal interpolation of
 * the final eight pixels, packs them to r5g6b5 and stores them.
 *
 * NOTE(review): compared with the canonical pixman source, this copy
 * shows only part of the 0565 packing sequence (a single vshll.u8 and a
 * single vsri.u16) and the source line numbering jumps (3270 -> 3273 ->
 * 3280 -> 3283) — verify that no packing instructions were lost.
 */
3256 .macro bilinear_interpolate_eight_pixels_8888_0565_tail
/* Complete horizontal blends for the pending pixels:
 * qN = left*256 - left*wx + right*wx (wx in d30/d31) */
3257 vmlal.u16 q1, d19, d31
3258 vshr.u16 q15, q12, #8
3259 vshll.u16 q2, d20, #8
3260 vmlsl.u16 q2, d20, d30
3261 vmlal.u16 q2, d21, d30
3262 vshll.u16 q3, d22, #8
3263 vmlsl.u16 q3, d22, d31
3264 vmlal.u16 q3, d23, d31
/* Step the horizontal weight accumulators */
3265 vadd.u16 q12, q12, q13
/* Narrow blend results back down to 8 bits per channel */
3266 vshrn.u32 d0, q0, #16
3267 vshrn.u32 d1, q1, #16
3268 vshrn.u32 d4, q2, #16
3269 vshr.u16 q15, q12, #8
3270 vshrn.u32 d5, q3, #16
3273 vadd.u16 q12, q12, q13
/* Pack toward r5g6b5: widen one channel, then insert a second channel
 * field with a shift-right-insert (presumably red into bits 15:11) */
3280 vshll.u8 q5, d10, #8
3283 vsri.u16 q5, q7, #11
/* Store eight packed 16-bit pixels (OUT is 128-bit aligned) */
3284 vst1.32 {d10, d11}, [OUT, :128]!
/*
 * Pipeline "tail_head": the steady-state loop body.  In one pass it
 * (a) finishes interpolating and stores the previous batch of eight
 * pixels and (b) loads and starts interpolating the next batch, so the
 * load/multiply latency of one batch hides behind the arithmetic of the
 * other.  Instruction order is deliberate scheduling — do not reorder.
 *
 * Register roles are as in *_head: TOP/STRIDE/X/TMP1-4 for addressing,
 * d28/d29 vertical weights, q12/q13 horizontal weight accumulators and
 * increments, q15 (d30/d31) current horizontal weights.
 * NOTE(review): the "add X, X, UX" advances and part of the 0565
 * packing are not visible in this copy (line numbering jumps) — verify
 * against the canonical source.
 */
3287 .macro bilinear_interpolate_eight_pixels_8888_0565_tail_head
/* Next batch, pixels 0-1: source pointers from integer part of X */
3288 mov TMP1, X, asr #16
3290 add TMP1, TOP, TMP1, asl #2
3291 mov TMP2, X, asr #16
3293 add TMP2, TOP, TMP2, asl #2
/* Previous batch: finish horizontal blends (left*256 - left*wx + right*wx) */
3294 vmlal.u16 q1, d19, d31
3295 vshr.u16 q15, q12, #8
3297 vshll.u16 q2, d20, #8
3298 vmlsl.u16 q2, d20, d30
3299 vmlal.u16 q2, d21, d30
3300 vshll.u16 q3, d22, #8
3301 vld1.32 {d20}, [TMP1], STRIDE
3302 vmlsl.u16 q3, d22, d31
3303 vmlal.u16 q3, d23, d31
3304 vld1.32 {d21}, [TMP1]
/* Next batch: vertical blend top*d28 + bottom*d29 */
3305 vmull.u8 q8, d20, d28
3306 vmlal.u8 q8, d21, d29
/* Previous batch: narrow results to 8 bits per channel */
3307 vshrn.u32 d0, q0, #16
3308 vshrn.u32 d1, q1, #16
3309 vshrn.u32 d4, q2, #16
3310 vld1.32 {d22}, [TMP2], STRIDE
3311 vshrn.u32 d5, q3, #16
3312 vadd.u16 q12, q12, q13
3313 vld1.32 {d23}, [TMP2]
3314 vmull.u8 q9, d22, d28
/* Next batch, pixels 2-3 */
3315 mov TMP3, X, asr #16
3317 add TMP3, TOP, TMP3, asl #2
3318 mov TMP4, X, asr #16
3320 add TMP4, TOP, TMP4, asl #2
3321 vmlal.u8 q9, d23, d29
3322 vld1.32 {d22}, [TMP3], STRIDE
3323 vshr.u16 q15, q12, #8
3324 vld1.32 {d23}, [TMP3]
3325 vmull.u8 q10, d22, d28
3326 vmlal.u8 q10, d23, d29
/* Next batch: begin horizontal blends */
3328 vshll.u16 q0, d16, #8
3330 vmlsl.u16 q0, d16, d30
3331 vmlal.u16 q0, d17, d30
3333 vld1.32 {d16}, [TMP4], STRIDE
3334 vadd.u16 q12, q12, q13
3335 vld1.32 {d17}, [TMP4]
3337 vmull.u8 q11, d16, d28
3338 vmlal.u8 q11, d17, d29
3340 vshll.u16 q1, d18, #8
3341 vmlsl.u16 q1, d18, d31
/* Next batch, pixels 4-5 */
3343 mov TMP1, X, asr #16
3345 add TMP1, TOP, TMP1, asl #2
3346 mov TMP2, X, asr #16
3348 add TMP2, TOP, TMP2, asl #2
3349 vmlal.u16 q1, d19, d31
3351 vshr.u16 q15, q12, #8
3352 vshll.u16 q2, d20, #8
3354 vmlsl.u16 q2, d20, d30
3355 vmlal.u16 q2, d21, d30
3356 vshll.u16 q3, d22, #8
3357 vld1.32 {d20}, [TMP1], STRIDE
3358 vmlsl.u16 q3, d22, d31
3359 vmlal.u16 q3, d23, d31
3360 vld1.32 {d21}, [TMP1]
3361 vmull.u8 q8, d20, d28
3362 vmlal.u8 q8, d21, d29
/* Previous batch: start packing toward r5g6b5 */
3364 vshll.u8 q5, d10, #8
3366 vshrn.u32 d0, q0, #16
3368 vshrn.u32 d1, q1, #16
/* Insert second channel field via shift-right-insert */
3369 vsri.u16 q5, q7, #11
3370 vshrn.u32 d4, q2, #16
3371 vld1.32 {d22}, [TMP2], STRIDE
3372 vshrn.u32 d5, q3, #16
3373 vadd.u16 q12, q12, q13
3374 vld1.32 {d23}, [TMP2]
3375 vmull.u8 q9, d22, d28
/* Next batch, pixels 6-7 */
3376 mov TMP3, X, asr #16
3378 add TMP3, TOP, TMP3, asl #2
3379 mov TMP4, X, asr #16
3381 add TMP4, TOP, TMP4, asl #2
3382 vmlal.u8 q9, d23, d29
3383 vld1.32 {d22}, [TMP3], STRIDE
3384 vshr.u16 q15, q12, #8
3385 vld1.32 {d23}, [TMP3]
3386 vmull.u8 q10, d22, d28
3387 vmlal.u8 q10, d23, d29
3389 vshll.u16 q0, d16, #8
3391 vmlsl.u16 q0, d16, d30
3392 vmlal.u16 q0, d17, d30
3394 vld1.32 {d16}, [TMP4], STRIDE
3395 vadd.u16 q12, q12, q13
3396 vld1.32 {d17}, [TMP4]
3398 vmull.u8 q11, d16, d28
3399 vmlal.u8 q11, d17, d29
3400 vshll.u16 q1, d18, #8
/* Store the previous batch's eight packed pixels (128-bit aligned) */
3401 vst1.32 {d10, d11}, [OUT, :128]!
3402 vmlsl.u16 q1, d18, d31
3404 /*****************************************************************************/
/*
 * Instantiate the SRC-operator bilinear scaled scanline fetchers from
 * the generic template.  Argument order (NOTE(review): inferred from
 * the template definition — confirm there): function name, source
 * format, destination format, log2 of source bytes-per-pixel, log2 of
 * destination bytes-per-pixel, prefetch distance, flags.
 */
/* a8r8g8b8 -> a8r8g8b8, 4-pixel unrolled loop */
3406 generate_bilinear_scanline_func \
3407 pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
3408 2, 2, 28, BILINEAR_FLAG_UNROLL_4
/* a8r8g8b8 -> r5g6b5: uses the hand-scheduled 8-pixel pipeline above,
 * which needs the full NEON register file (q0-q15) */
3410 generate_bilinear_scanline_func \
3411 pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
3412 2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS
/* r5g6b5 -> x8r8g8b8, 4-pixel unrolled loop */
3414 generate_bilinear_scanline_func \
3415 pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
3416 1, 2, 28, BILINEAR_FLAG_UNROLL_4
/* r5g6b5 -> r5g6b5, 4-pixel unrolled loop */
3418 generate_bilinear_scanline_func \
3419 pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
3420 1, 1, 28, BILINEAR_FLAG_UNROLL_4