2 * Copyright © 2009 Nokia Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
23 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
27 * This file contains implementations of NEON optimized pixel processing
28 * functions. There is no full and detailed tutorial, but some functions
29 * (those which are exposing some new or interesting features) are
30 * extensively commented and can be used as examples.
32 * You may want to have a look at the comments for following functions:
33 * - pixman_composite_over_8888_0565_asm_neon
34 * - pixman_composite_over_n_8_0565_asm_neon
37 /* Prevent the stack from becoming executable for no reason... */
38 #if defined(__linux__) && defined(__ELF__)
39 .section .note.GNU-stack,"",%progbits
/* NOTE(review): the matching #endif is not visible in this excerpt —
   confirm it is present in the full file. */
46 .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
47 .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
52 #include "pixman-arm-neon-asm.h"
54 /* Global configuration options and preferences */
57 * The code can optionally make use of unaligned memory accesses to improve
58 * performance of handling leading/trailing pixels for each scanline.
59 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
60 * example in linux if unaligned memory accesses are not configured to
61 * generate exceptions.
/* 1 = never emit unaligned accesses; 0 = allow them for edge pixels */
63 .set RESPECT_STRICT_ALIGNMENT, 1
66 * Set default prefetch type. There is a choice between the following options:
68 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
69 * as NOP to workaround some HW bugs or for whatever other reason)
71 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
72 * advanced prefetch introduces heavy overhead)
74 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
75 * which can run ARM and NEON instructions simultaneously so that extra ARM
76 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
78 * Note: some types of function can't support advanced prefetch and fallback
79 * to simple one (those which handle 24bpp pixels)
81 .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
83 /* Prefetch distance in pixels for simple prefetch */
84 .set PREFETCH_DISTANCE_SIMPLE, 64
87 * Implementation of pixman_composite_over_8888_0565_asm_neon
89 * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
90 * performs OVER compositing operation. Function fast_composite_over_8888_0565
91 * from pixman-fast-path.c does the same in C and can be used as a reference.
93 * First we need to have some NEON assembly code which can do the actual
94 * operation on the pixels and provide it to the template macro.
96 * Template macro quite conveniently takes care of emitting all the necessary
97 * code for memory reading and writing (including quite tricky cases of
98 * handling unaligned leading/trailing pixels), so we only need to deal with
99 * the data in NEON registers.
101 * NEON registers allocation in general is recommended to be the following:
102 * d0, d1, d2, d3 - contain loaded source pixel data
103 * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed)
104 * d24, d25, d26, d27 - contain loaded mask pixel data (if mask is used)
105 * d28, d29, d30, d31 - place for storing the result (destination pixels)
107 * As can be seen above, four 64-bit NEON registers are used for keeping
108 * intermediate pixel data and up to 8 pixels can be processed in one step
109 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
111 * This particular function uses the following registers allocation:
112 * d0, d1, d2, d3 - contain loaded source pixel data
113 * d4, d5 - contain loaded destination pixels (they are needed)
114 * d28, d29 - place for storing the result (destination pixels)
118 * Step one. We need to have some code to do some arithmetics on pixel data.
119 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
120 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
121 * perform all the needed calculations and write the result to {d28, d29}.
122 * The rationale for having two macros and not just one will be explained
123 * later. In practice, any single monolithic function which does the work can
124 * be split into two parts in any arbitrary way without affecting correctness.
126 * There is one special trick here too. Common template macro can optionally
127 * make our life a bit easier by doing R, G, B, A color components
128 * deinterleaving for 32bpp pixel formats (and this feature is used in
129 * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
130 * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
131 * actually use d0 register for blue channel (a vector of eight 8-bit
132 * values), d1 register for green, d2 for red and d3 for alpha. This
133 * simple conversion can be also done with a few NEON instructions:
135 * Packed to planar conversion:
141 * Planar to packed conversion:
147 * But pixel can be loaded directly in planar format using VLD4.8 NEON
148 * instruction. It is 1 cycle slower than VLD1.32, so this is not always
149 * desirable, that's why deinterleaving is optional.
151 * But anyway, here is the code:
/*
 * Head of the OVER a8r8g8b8 -> r5g6b5 pixel-block pipeline.
 * Planar source is in d0-d3 (d3 = alpha); packed r5g6b5 destination in
 * {d4, d5}.  Leaves intermediate widened products in q10-q12 for the tail.
 * The vrshr #8 / vraddhn pairs implement the standard rounded
 * approximation of division by 255: (x + ((x + 128) >> 8) + 128) >> 8.
 */
153 .macro pixman_composite_over_8888_0565_process_pixblock_head
154 /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
155 and put data into d6 - red, d7 - green, d30 - blue */
160 vmvn.8 d3, d3 /* invert source alpha */
162 vshrn.u16 d30, q2, #2
163 /* now do alpha blending, storing results in 8-bit planar format
164 into d16 - red, d19 - green, d18 - blue */
167 vmull.u8 q12, d3, d30 /* (1 - alpha) * dst_blue */
168 vrshr.u16 q13, q10, #8
169 vrshr.u16 q3, q11, #8
170 vrshr.u16 q15, q12, #8
171 vraddhn.u16 d20, q10, q13
172 vraddhn.u16 d23, q11, q3
173 vraddhn.u16 d22, q12, q15
/*
 * Tail of the OVER a8r8g8b8 -> r5g6b5 pipeline: completes the blend
 * started in the head and packs the result back to r5g6b5 in {d28, d29}.
 */
176 .macro pixman_composite_over_8888_0565_process_pixblock_tail
177 /* ... continue alpha blending */
178 vqadd.u8 d16, d2, d20 /* saturating add of source component */
180 /* convert the result to r5g6b5 and store it into {d28, d29} */
181 vshll.u8 q14, d16, #8
185 vsri.u16 q14, q9, #11 /* insert green/blue bits below red */
189 * OK, now we got almost everything that we need. Using the above two
190 * macros, the work can be done right. But now we want to optimize
191 * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
192 * a lot from good code scheduling and software pipelining.
194 * Let's construct some code, which will run in the core main loop.
195 * Some pseudo-code of the main loop will look like this:
203 * It may look a bit weird, but this setup allows to hide instruction
204 * latencies better and also utilize dual-issue capability more
205 * efficiently (make pairs of load-store and ALU instructions).
207 * So what we need now is a '*_tail_head' macro, which will be used
208 * in the core main loop. A trivial straightforward implementation
209 * of this macro would look like this:
211 * pixman_composite_over_8888_0565_process_pixblock_tail
212 * vst1.16 {d28, d29}, [DST_W, :128]!
213 * vld1.16 {d4, d5}, [DST_R, :128]!
214 * vld4.32 {d0, d1, d2, d3}, [SRC]!
215 * pixman_composite_over_8888_0565_process_pixblock_head
218 * Now it also got some VLD/VST instructions. We simply can't move from
219 * processing one block of pixels to the other one with just arithmetics.
220 * The previously processed data needs to be written to memory and new
221 * data needs to be fetched. Fortunately, this main loop does not deal
222 * with partial leading/trailing pixels and can load/store a full block
223 * of pixels in a bulk. Additionally, destination buffer is already
224 * 16 bytes aligned here (which is good for performance).
226 * New things here are DST_R, DST_W, SRC and MASK identifiers. These
227 * are the aliases for ARM registers which are used as pointers for
228 * accessing data. We maintain separate pointers for reading and writing
229 * destination buffer (DST_R and DST_W).
231 * Another new thing is 'cache_preload' macro. It is used for prefetching
232 * data into CPU L2 cache and improve performance when dealing with large
233 * images which are far larger than cache size. It uses one argument
234 * (actually two, but they need to be the same here) - number of pixels
235 * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
236 * details about this macro. Moreover, if good performance is needed
237 * the code from this macro needs to be copied into '*_tail_head' macro
238 * and mixed with the rest of code for optimal instructions scheduling.
239 * We are actually doing it below.
241 * Now after all the explanations, here is the optimized code.
242 * Different instruction streams (originating from '*_head', '*_tail'
243 * and 'cache_preload' macro) use different indentation levels for
244 * better readability. Actually taking the code from one of these
245 * indentation levels and ignoring a few VLD/VST instructions would
246 * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
/*
 * Software-pipelined tail+head for the main loop: finishes the previous
 * block (tail instructions), loads/stores pixel data for the next one,
 * and interleaves the 'PF' advanced-prefetch bookkeeping.  The three
 * interleaved instruction streams keep their own indentation levels, as
 * explained in the big comment above.
 * NOTE(review): 'PF' is presumably a conditional-emit macro from
 * pixman-arm-neon-asm.h (active only for advanced prefetch) — confirm.
 */
252 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head
253 vqadd.u8 d16, d2, d20
254 vld1.16 {d4, d5}, [DST_R, :128]! /* load next destination block */
260 vshll.u8 q14, d16, #8
261 PF add PF_X, PF_X, #8
265 PF addne PF_X, PF_X, #8
267 PF subne PF_CTL, PF_CTL, #1
269 vshrn.u16 d30, q2, #2
271 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
273 vmull.u8 q12, d3, d30
274 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
278 vrshr.u16 q13, q10, #8
279 PF subge PF_X, PF_X, ORIG_W
280 vrshr.u16 q3, q11, #8
281 vrshr.u16 q15, q12, #8
282 PF subges PF_CTL, PF_CTL, #0x10
283 vsri.u16 q14, q9, #11
284 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
285 vraddhn.u16 d20, q10, q13
286 vraddhn.u16 d23, q11, q3
287 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
288 vraddhn.u16 d22, q12, q15
289 vst1.16 {d28, d29}, [DST_W, :128]! /* store previous block's result */
294 /* If we did not care much about the performance, we would just use this... */
/* NOTE(review): this is a second definition of the same macro name; in the
   upstream file this illustrative variant is disabled with '#if 0'.  The
   guard lines are not visible in this excerpt — confirm against the full
   file, since an unguarded redefinition would be an assembler error. */
295 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head
296 pixman_composite_over_8888_0565_process_pixblock_tail
297 vst1.16 {d28, d29}, [DST_W, :128]!
298 vld1.16 {d4, d5}, [DST_R, :128]!
300 pixman_composite_over_8888_0565_process_pixblock_head
307 * And now the final part. We are using 'generate_composite_function' macro
308 * to put all the stuff together. We are specifying the name of the function
309 * which we want to get, number of bits per pixel for the source, mask and
310 * destination (0 if unused, like mask in this case). Next come some bit
312 * FLAG_DST_READWRITE - tells that the destination buffer is both read
313 * and written, for write-only buffer we would use
314 * FLAG_DST_WRITEONLY flag instead
315 * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
316 * and separate color channels for 32bpp format.
317 * The next things are:
318 * - the number of pixels processed per iteration (8 in this case, because
319 * that's the maximum what can fit into four 64-bit NEON registers).
320 * - prefetch distance, measured in pixel blocks. In this case it is 5 times
321 * by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
322 * prefetch distance can be selected by running some benchmarks.
324 * After that we specify some macros, these are 'default_init',
325 * 'default_cleanup' here which are empty (but it is possible to have custom
326 * init/cleanup macros to be able to save/restore some extra NEON registers
327 * like d8-d15 or do anything else) followed by
328 * 'pixman_composite_over_8888_0565_process_pixblock_head',
329 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
330 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
331 * which we got implemented above.
333 * The last part is the NEON registers allocation scheme.
/* Instantiate the OVER 8888 -> 0565 fast path from the macros above.
   Positional args: name, src_bpp=32, mask_bpp=0 (unused), dst_bpp=16,
   flags, pixels per block, prefetch distance, pipeline macros, and the
   NEON base-register allocation scheme. */
335 generate_composite_function \
336 pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
337 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
338 8, /* number of pixels, processed in a single block */ \
339 5, /* prefetch distance */ \
342 pixman_composite_over_8888_0565_process_pixblock_head, \
343 pixman_composite_over_8888_0565_process_pixblock_tail, \
344 pixman_composite_over_8888_0565_process_pixblock_tail_head, \
345 28, /* dst_w_basereg */ \
346 4, /* dst_r_basereg */ \
347 0, /* src_basereg */ \
348 24 /* mask_basereg */
350 /******************************************************************************/
/*
 * OVER solid-color -> r5g6b5 pipeline.  Same blend arithmetic as the
 * 8888->0565 variant, but the (pre-inverted) solid source stays resident
 * in d0-d3, so the head does not re-invert alpha per block.
 */
352 .macro pixman_composite_over_n_0565_process_pixblock_head
353 /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
354 and put data into d6 - red, d7 - green, d30 - blue */
360 vshrn.u16 d30, q2, #2
361 /* now do alpha blending, storing results in 8-bit planar format
362 into d16 - red, d19 - green, d18 - blue */
365 vmull.u8 q12, d3, d30
366 vrshr.u16 q13, q10, #8
367 vrshr.u16 q3, q11, #8
368 vrshr.u16 q15, q12, #8
369 vraddhn.u16 d20, q10, q13
370 vraddhn.u16 d23, q11, q3
371 vraddhn.u16 d22, q12, q15
/* Tail: finish blend and pack result to r5g6b5 in {d28, d29}. */
374 .macro pixman_composite_over_n_0565_process_pixblock_tail
375 /* ... continue alpha blending */
376 vqadd.u8 d16, d2, d20
378 /* convert the result to r5g6b5 and store it into {d28, d29} */
379 vshll.u8 q14, d16, #8
383 vsri.u16 q14, q9, #11
386 /* TODO: expand macros and do better instructions scheduling */
387 .macro pixman_composite_over_n_0565_process_pixblock_tail_head
388 pixman_composite_over_n_0565_process_pixblock_tail
389 vld1.16 {d4, d5}, [DST_R, :128]!
390 vst1.16 {d28, d29}, [DST_W, :128]!
391 pixman_composite_over_n_0565_process_pixblock_head
/* Init: fetch the solid source color from the stack argument area,
   then invert its alpha once up front (it never changes per block). */
395 .macro pixman_composite_over_n_0565_init
396 add DUMMY, sp, #ARGS_STACK_OFFSET
397 vld1.32 {d3[0]}, [DUMMY]
402 vmvn.8 d3, d3 /* invert source alpha */
/* Instantiate OVER solid -> 0565 (src_bpp=0 means solid source). */
405 generate_composite_function \
406 pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
407 FLAG_DST_READWRITE, \
408 8, /* number of pixels, processed in a single block */ \
409 5, /* prefetch distance */ \
410 pixman_composite_over_n_0565_init, \
412 pixman_composite_over_n_0565_process_pixblock_head, \
413 pixman_composite_over_n_0565_process_pixblock_tail, \
414 pixman_composite_over_n_0565_process_pixblock_tail_head, \
415 28, /* dst_w_basereg */ \
416 4, /* dst_r_basereg */ \
417 0, /* src_basereg */ \
418 24 /* mask_basereg */
420 /******************************************************************************/
/*
 * SRC a8r8g8b8 -> r5g6b5 conversion pipeline (no blending: destination
 * is write-only).  Tail/tail_head pack the planar source into r5g6b5 in
 * {d28, d29} and interleave source-side prefetch bookkeeping.
 */
422 .macro pixman_composite_src_8888_0565_process_pixblock_head
428 .macro pixman_composite_src_8888_0565_process_pixblock_tail
430 vsri.u16 q14, q9, #11
433 .macro pixman_composite_src_8888_0565_process_pixblock_tail_head
435 PF add PF_X, PF_X, #8
438 PF addne PF_X, PF_X, #8
439 PF subne PF_CTL, PF_CTL, #1
440 vsri.u16 q14, q9, #11
442 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
444 vst1.16 {d28, d29}, [DST_W, :128]!
445 PF subge PF_X, PF_X, ORIG_W
446 PF subges PF_CTL, PF_CTL, #0x10
448 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
/* Instantiate SRC 8888 -> 0565; destination is write-only, so a longer
   prefetch distance (10 blocks) is used than for the blending paths. */
452 generate_composite_function \
453 pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
454 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
455 8, /* number of pixels, processed in a single block */ \
456 10, /* prefetch distance */ \
459 pixman_composite_src_8888_0565_process_pixblock_head, \
460 pixman_composite_src_8888_0565_process_pixblock_tail, \
461 pixman_composite_src_8888_0565_process_pixblock_tail_head
463 /******************************************************************************/
/*
 * SRC r5g6b5 -> a8r8g8b8 expansion pipeline.  The head extracts the
 * 5/6/5 fields from packed source pixels in q0 into separate planar
 * bytes (d28-d30); results are stored interleaved with vst4.8.
 */
465 .macro pixman_composite_src_0565_8888_process_pixblock_head
466 vshrn.u16 d30, q0, #8 /* extract red field */
467 vshrn.u16 d29, q0, #3 /* extract green field */
472 vshrn.u16 d28, q0, #2 /* extract blue field */
475 .macro pixman_composite_src_0565_8888_process_pixblock_tail
478 /* TODO: expand macros and do better instructions scheduling */
479 .macro pixman_composite_src_0565_8888_process_pixblock_tail_head
480 pixman_composite_src_0565_8888_process_pixblock_tail
481 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
483 pixman_composite_src_0565_8888_process_pixblock_head
/* Instantiate SRC 0565 -> 8888 (write-only destination, planar stores). */
487 generate_composite_function \
488 pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
489 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
490 8, /* number of pixels, processed in a single block */ \
491 10, /* prefetch distance */ \
494 pixman_composite_src_0565_8888_process_pixblock_head, \
495 pixman_composite_src_0565_8888_process_pixblock_tail, \
496 pixman_composite_src_0565_8888_process_pixblock_tail_head
498 /******************************************************************************/
/*
 * ADD a8 + a8 pipeline: 32 8-bit pixels per block.  The tail_head loads
 * the next destination block, stores the previous saturated sum from
 * d28-d31, and interleaves prefetch bookkeeping for both src and dst.
 */
500 .macro pixman_composite_add_8_8_process_pixblock_head
505 .macro pixman_composite_add_8_8_process_pixblock_tail
508 .macro pixman_composite_add_8_8_process_pixblock_tail_head
510 PF add PF_X, PF_X, #32
512 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
513 PF addne PF_X, PF_X, #32
514 PF subne PF_CTL, PF_CTL, #1
515 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
517 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
518 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
519 PF subge PF_X, PF_X, ORIG_W
520 PF subges PF_CTL, PF_CTL, #0x10
522 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
523 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
/* Instantiate ADD a8 + a8 -> a8 (32 pixels per block at 8bpp). */
527 generate_composite_function \
528 pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
529 FLAG_DST_READWRITE, \
530 32, /* number of pixels, processed in a single block */ \
531 10, /* prefetch distance */ \
534 pixman_composite_add_8_8_process_pixblock_head, \
535 pixman_composite_add_8_8_process_pixblock_tail, \
536 pixman_composite_add_8_8_process_pixblock_tail_head
538 /******************************************************************************/
/*
 * ADD 8888 + 8888 tail_head: same structure as the add_8_8 variant
 * (whose head/tail it reuses — see the generate call below) but moves
 * 8 32-bit pixels per block with 32-bit element loads/stores.
 */
540 .macro pixman_composite_add_8888_8888_process_pixblock_tail_head
542 PF add PF_X, PF_X, #8
544 vld1.32 {d4, d5, d6, d7}, [DST_R, :128]!
545 PF addne PF_X, PF_X, #8
546 PF subne PF_CTL, PF_CTL, #1
547 vst1.32 {d28, d29, d30, d31}, [DST_W, :128]!
549 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
550 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
551 PF subge PF_X, PF_X, ORIG_W
552 PF subges PF_CTL, PF_CTL, #0x10
554 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
555 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
/* Instantiate ADD 8888 + 8888 -> 8888, reusing the add_8_8 head/tail
   (the per-pixel arithmetic is format-agnostic saturated addition). */
559 generate_composite_function \
560 pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
561 FLAG_DST_READWRITE, \
562 8, /* number of pixels, processed in a single block */ \
563 10, /* prefetch distance */ \
566 pixman_composite_add_8_8_process_pixblock_head, \
567 pixman_composite_add_8_8_process_pixblock_tail, \
568 pixman_composite_add_8888_8888_process_pixblock_tail_head
/* Single-scanline variant for the ADD combiner (no prefetch distance
   argument — scanline functions take a different parameter list). */
570 generate_composite_function_single_scanline \
571 pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
572 FLAG_DST_READWRITE, \
573 8, /* number of pixels, processed in a single block */ \
576 pixman_composite_add_8_8_process_pixblock_head, \
577 pixman_composite_add_8_8_process_pixblock_tail, \
578 pixman_composite_add_8888_8888_process_pixblock_tail_head
580 /******************************************************************************/
/*
 * OUT_REVERSE 8888 x 8888 pipeline: dst = dst * (1 - src.alpha).
 * Head multiplies destination channels by inverted source alpha (d24);
 * tail performs the rounded /255 reduction (vrshr #8 + vraddhn) into
 * d28-d31.  This head/tail pair is also reused by the OVER variant below.
 */
582 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
583 vmvn.8 d24, d3 /* get inverted alpha */
584 /* do alpha blending */
587 vmull.u8 q10, d24, d6
588 vmull.u8 q11, d24, d7
591 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
592 vrshr.u16 q14, q8, #8
593 vrshr.u16 q15, q9, #8
594 vrshr.u16 q12, q10, #8
595 vrshr.u16 q13, q11, #8
596 vraddhn.u16 d28, q14, q8
597 vraddhn.u16 d29, q15, q9
598 vraddhn.u16 d30, q12, q10
599 vraddhn.u16 d31, q13, q11
/* Pipelined tail+head with interleaved prefetch bookkeeping. */
602 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
603 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
604 vrshr.u16 q14, q8, #8
605 PF add PF_X, PF_X, #8
607 vrshr.u16 q15, q9, #8
608 vrshr.u16 q12, q10, #8
609 vrshr.u16 q13, q11, #8
610 PF addne PF_X, PF_X, #8
611 PF subne PF_CTL, PF_CTL, #1
612 vraddhn.u16 d28, q14, q8
613 vraddhn.u16 d29, q15, q9
615 vraddhn.u16 d30, q12, q10
616 vraddhn.u16 d31, q13, q11
618 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
620 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
621 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
622 PF subge PF_X, PF_X, ORIG_W
624 PF subges PF_CTL, PF_CTL, #0x10
626 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
627 vmull.u8 q10, d22, d6
628 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
629 vmull.u8 q11, d22, d7
/* Instantiate single-scanline OUT_REVERSE combiner. */
632 generate_composite_function_single_scanline \
633 pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
634 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
635 8, /* number of pixels, processed in a single block */ \
638 pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
639 pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
640 pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
642 /******************************************************************************/
/*
 * OVER 8888 x 8888 pipeline: dst = src + dst * (1 - src.alpha).
 * Reuses the OUT_REVERSE head/tail and adds the saturating addition of
 * the source (q0/q1) on top of the attenuated destination (q14/q15).
 */
644 .macro pixman_composite_over_8888_8888_process_pixblock_head
645 pixman_composite_out_reverse_8888_8888_process_pixblock_head
648 .macro pixman_composite_over_8888_8888_process_pixblock_tail
649 pixman_composite_out_reverse_8888_8888_process_pixblock_tail
650 vqadd.u8 q14, q0, q14
651 vqadd.u8 q15, q1, q15
/* Pipelined tail+head; same structure as the OUT_REVERSE tail_head plus
   the two vqadd instructions completing the OVER operator. */
654 .macro pixman_composite_over_8888_8888_process_pixblock_tail_head
655 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
656 vrshr.u16 q14, q8, #8
657 PF add PF_X, PF_X, #8
659 vrshr.u16 q15, q9, #8
660 vrshr.u16 q12, q10, #8
661 vrshr.u16 q13, q11, #8
662 PF addne PF_X, PF_X, #8
663 PF subne PF_CTL, PF_CTL, #1
664 vraddhn.u16 d28, q14, q8
665 vraddhn.u16 d29, q15, q9
667 vraddhn.u16 d30, q12, q10
668 vraddhn.u16 d31, q13, q11
669 vqadd.u8 q14, q0, q14
670 vqadd.u8 q15, q1, q15
672 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
674 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
675 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
676 PF subge PF_X, PF_X, ORIG_W
678 PF subges PF_CTL, PF_CTL, #0x10
680 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
681 vmull.u8 q10, d22, d6
682 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
683 vmull.u8 q11, d22, d7
/* Instantiate OVER 8888 x 8888 -> 8888. */
686 generate_composite_function \
687 pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
688 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
689 8, /* number of pixels, processed in a single block */ \
690 5, /* prefetch distance */ \
693 pixman_composite_over_8888_8888_process_pixblock_head, \
694 pixman_composite_over_8888_8888_process_pixblock_tail, \
695 pixman_composite_over_8888_8888_process_pixblock_tail_head
/* Single-scanline OVER combiner sharing the same pipeline macros. */
697 generate_composite_function_single_scanline \
698 pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
699 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
700 8, /* number of pixels, processed in a single block */ \
703 pixman_composite_over_8888_8888_process_pixblock_head, \
704 pixman_composite_over_8888_8888_process_pixblock_tail, \
705 pixman_composite_over_8888_8888_process_pixblock_tail_head
707 /******************************************************************************/
/*
 * OVER solid-color x 8888 pipeline.  Because the source is constant,
 * its inverted alpha stays in d24 across all blocks (set up in init),
 * so the head only needs the destination multiplies.
 */
709 .macro pixman_composite_over_n_8888_process_pixblock_head
710 /* deinterleaved source pixels in {d0, d1, d2, d3} */
711 /* inverted alpha in {d24} */
712 /* destination pixels in {d4, d5, d6, d7} */
715 vmull.u8 q10, d24, d6
716 vmull.u8 q11, d24, d7
/* Tail: rounded /255 reduction, then saturating add of the solid src. */
719 .macro pixman_composite_over_n_8888_process_pixblock_tail
720 vrshr.u16 q14, q8, #8
721 vrshr.u16 q15, q9, #8
722 vrshr.u16 q2, q10, #8
723 vrshr.u16 q3, q11, #8
724 vraddhn.u16 d28, q14, q8
725 vraddhn.u16 d29, q15, q9
726 vraddhn.u16 d30, q2, q10
727 vraddhn.u16 d31, q3, q11
728 vqadd.u8 q14, q0, q14
729 vqadd.u8 q15, q1, q15
/* Pipelined tail+head; only destination-side prefetch is needed since
   there is no source buffer to walk. */
732 .macro pixman_composite_over_n_8888_process_pixblock_tail_head
733 vrshr.u16 q14, q8, #8
734 vrshr.u16 q15, q9, #8
735 vrshr.u16 q2, q10, #8
736 vrshr.u16 q3, q11, #8
737 vraddhn.u16 d28, q14, q8
738 vraddhn.u16 d29, q15, q9
739 vraddhn.u16 d30, q2, q10
740 vraddhn.u16 d31, q3, q11
741 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
742 vqadd.u8 q14, q0, q14
743 PF add PF_X, PF_X, #8
745 PF addne PF_X, PF_X, #8
746 PF subne PF_CTL, PF_CTL, #1
747 vqadd.u8 q15, q1, q15
750 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
752 PF subge PF_X, PF_X, ORIG_W
753 vmull.u8 q10, d24, d6
754 PF subges PF_CTL, PF_CTL, #0x10
755 vmull.u8 q11, d24, d7
756 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
757 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* Init: load solid color from the stack arguments and precompute the
   inverted alpha in d24. */
760 .macro pixman_composite_over_n_8888_init
761 add DUMMY, sp, #ARGS_STACK_OFFSET
762 vld1.32 {d3[0]}, [DUMMY]
767 vmvn.8 d24, d3 /* get inverted alpha */
/* Instantiate OVER solid -> 8888; note it reuses the 8888_8888 head and
   tail and only supplies a specialized tail_head and init. */
770 generate_composite_function \
771 pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
772 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
773 8, /* number of pixels, processed in a single block */ \
774 5, /* prefetch distance */ \
775 pixman_composite_over_n_8888_init, \
777 pixman_composite_over_8888_8888_process_pixblock_head, \
778 pixman_composite_over_8888_8888_process_pixblock_tail, \
779 pixman_composite_over_n_8888_process_pixblock_tail_head
781 /******************************************************************************/
/*
 * OVER_REVERSE solid x 8888 pipeline (destination composited over the
 * solid color).  Note the role reversal: destination pixels are loaded
 * into d0-d3 (the "source" slots of the shared 8888_8888 head/tail),
 * and the constant color is loaded into d7 by the init macro.
 */
783 .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
784 vrshr.u16 q14, q8, #8
785 PF add PF_X, PF_X, #8
787 vrshr.u16 q15, q9, #8
788 vrshr.u16 q12, q10, #8
789 vrshr.u16 q13, q11, #8
790 PF addne PF_X, PF_X, #8
791 PF subne PF_CTL, PF_CTL, #1
792 vraddhn.u16 d28, q14, q8
793 vraddhn.u16 d29, q15, q9
795 vraddhn.u16 d30, q12, q10
796 vraddhn.u16 d31, q13, q11
797 vqadd.u8 q14, q0, q14
798 vqadd.u8 q15, q1, q15
799 vld4.8 {d0, d1, d2, d3}, [DST_R, :128]! /* dst goes to the "src" regs */
801 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
802 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
803 PF subge PF_X, PF_X, ORIG_W
805 PF subges PF_CTL, PF_CTL, #0x10
807 vmull.u8 q10, d22, d6
808 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
809 vmull.u8 q11, d22, d7
/* Init: load the solid color from the stack arguments into d7. */
812 .macro pixman_composite_over_reverse_n_8888_init
813 add DUMMY, sp, #ARGS_STACK_OFFSET
814 vld1.32 {d7[0]}, [DUMMY]
/* Instantiate OVER_REVERSE solid -> 8888.  The swapped base registers
   (dst_r_basereg=0, src_basereg=4) reflect the role reversal described
   above: destination pixels are read into the d0-d3 slots. */
821 generate_composite_function \
822 pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
823 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
824 8, /* number of pixels, processed in a single block */ \
825 5, /* prefetch distance */ \
826 pixman_composite_over_reverse_n_8888_init, \
828 pixman_composite_over_8888_8888_process_pixblock_head, \
829 pixman_composite_over_8888_8888_process_pixblock_tail, \
830 pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
831 28, /* dst_w_basereg */ \
832 0, /* dst_r_basereg */ \
833 4, /* src_basereg */ \
834 24 /* mask_basereg */
836 /******************************************************************************/
/*
 * OVER a8r8g8b8 (masked by a8) -> r5g6b5 pipeline.  Source lives in
 * d8-d11 (src_basereg = 8), mask alpha in d24.  The head first applies
 * the mask to the source (IN step), expands the r5g6b5 destination to
 * planar 8-bit, then starts the alpha blend; the tail reduces and packs
 * the result into {d28, d29}.
 */
838 .macro pixman_composite_over_8888_8_0565_process_pixblock_head
839 vmull.u8 q0, d24, d8 /* IN for SRC pixels (part1) */
841 vmull.u8 q6, d24, d10
842 vmull.u8 q7, d24, d11
843 vshrn.u16 d6, q2, #8 /* convert DST_R data to 32-bpp (part1) */
846 vrshr.u16 q8, q0, #8 /* IN for SRC pixels (part2) */
848 vrshr.u16 q10, q6, #8
849 vrshr.u16 q11, q7, #8
850 vraddhn.u16 d0, q0, q8
851 vraddhn.u16 d1, q1, q9
852 vraddhn.u16 d2, q6, q10
853 vraddhn.u16 d3, q7, q11
854 vsri.u8 d6, d6, #5 /* convert DST_R data to 32-bpp (part2) */
857 vshrn.u16 d30, q2, #2
858 vmull.u8 q8, d3, d6 /* now do alpha blending */
860 vmull.u8 q10, d3, d30
/* Tail: rounded /255 reduction of the blend products, saturating add of
   the masked source, and packing back to r5g6b5. */
863 .macro pixman_composite_over_8888_8_0565_process_pixblock_tail
864 /* 3 cycle bubble (after vmull.u8) */
865 vrshr.u16 q13, q8, #8
866 vrshr.u16 q11, q9, #8
867 vrshr.u16 q15, q10, #8
868 vraddhn.u16 d16, q8, q13
869 vraddhn.u16 d27, q9, q11
870 vraddhn.u16 d26, q10, q15
871 vqadd.u8 d16, d2, d16
874 vshll.u8 q14, d16, #8 /* convert to 16bpp */
879 vsri.u16 q14, q9, #11
/* Pipelined tail+head (no PF stream here — this path mixes the head and
   tail instruction streams only). */
882 .macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
883 vld1.16 {d4, d5}, [DST_R, :128]!
888 vmull.u8 q6, d24, d10
889 vrshr.u16 q13, q8, #8
890 vrshr.u16 q11, q9, #8
891 vrshr.u16 q15, q10, #8
892 vraddhn.u16 d16, q8, q13
893 vraddhn.u16 d27, q9, q11
894 vraddhn.u16 d26, q10, q15
895 vqadd.u8 d16, d2, d16
898 vshll.u8 q14, d16, #8
903 vmull.u8 q7, d24, d11
904 vsri.u16 q14, q9, #11
911 vrshr.u16 q10, q6, #8
912 vrshr.u16 q11, q7, #8
913 vraddhn.u16 d0, q0, q8
914 vraddhn.u16 d1, q1, q9
915 vraddhn.u16 d2, q6, q10
916 vraddhn.u16 d3, q7, q11
920 vshrn.u16 d30, q2, #2
921 vst1.16 {d28, d29}, [DST_W, :128]!
924 vmull.u8 q10, d3, d30
/* Instantiate OVER 8888 (a8 mask) -> 0565.  Uses d8-d15, hence the
   need_all_regs init/cleanup to save/restore the callee-saved half of
   the NEON register file. */
927 generate_composite_function \
928 pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
929 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
930 8, /* number of pixels, processed in a single block */ \
931 5, /* prefetch distance */ \
932 default_init_need_all_regs, \
933 default_cleanup_need_all_regs, \
934 pixman_composite_over_8888_8_0565_process_pixblock_head, \
935 pixman_composite_over_8888_8_0565_process_pixblock_tail, \
936 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
937 28, /* dst_w_basereg */ \
938 4, /* dst_r_basereg */ \
939 8, /* src_basereg */ \
940 24 /* mask_basereg */
942 /******************************************************************************/
945 * This function needs a special initialization of solid mask.
946 * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
947 * offset, split into color components and replicated in d8-d11
948 * registers. Additionally, this function needs all the NEON registers,
949 * so it has to save d8-d15 registers which are callee saved according
950 * to ABI. These registers are restored from 'cleanup' macro. All the
951 * other NEON registers are caller saved, so can be clobbered freely
952 * without introducing any problems.
/* Init for OVER solid (a8 mask) -> 0565: fetch the solid source color
   from the stack argument area into d11 (see the explanatory comment
   above; the color is then split/replicated across d8-d11 — those
   replication instructions are described there). */
954 .macro pixman_composite_over_n_8_0565_init
955 add DUMMY, sp, #ARGS_STACK_OFFSET
957 vld1.32 {d11[0]}, [DUMMY]
/* Cleanup: restores the callee-saved d8-d15 registers saved by init. */
964 .macro pixman_composite_over_n_8_0565_cleanup
/* Instantiate OVER solid (a8 mask) -> 0565, reusing the 8888_8_0565
   pipeline with the solid-color init/cleanup. */
968 generate_composite_function \
969 pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
970 FLAG_DST_READWRITE, \
971 8, /* number of pixels, processed in a single block */ \
972 5, /* prefetch distance */ \
973 pixman_composite_over_n_8_0565_init, \
974 pixman_composite_over_n_8_0565_cleanup, \
975 pixman_composite_over_8888_8_0565_process_pixblock_head, \
976 pixman_composite_over_8888_8_0565_process_pixblock_tail, \
977 pixman_composite_over_8888_8_0565_process_pixblock_tail_head
979 /******************************************************************************/
/* Init for OVER 8888 (solid mask) -> 0565: the solid mask value is the
   second stack argument (offset +8), loaded into d24 where the shared
   8888_8_0565 pipeline expects the mask alpha. */
981 .macro pixman_composite_over_8888_n_0565_init
982 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
984 vld1.32 {d24[0]}, [DUMMY]
/* Cleanup: restores callee-saved NEON registers saved by init. */
988 .macro pixman_composite_over_8888_n_0565_cleanup
/* Instantiate OVER 8888 (solid mask) -> 0565 from the shared pipeline. */
992 generate_composite_function \
993 pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
994 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
995 8, /* number of pixels, processed in a single block */ \
996 5, /* prefetch distance */ \
997 pixman_composite_over_8888_n_0565_init, \
998 pixman_composite_over_8888_n_0565_cleanup, \
999 pixman_composite_over_8888_8_0565_process_pixblock_head, \
1000 pixman_composite_over_8888_8_0565_process_pixblock_tail, \
1001 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
1002 28, /* dst_w_basereg */ \
1003 4, /* dst_r_basereg */ \
1004 8, /* src_basereg */ \
1005 24 /* mask_basereg */
1007 /******************************************************************************/
/*
 * SRC 0565 -> 0565: a plain 16bpp copy.  Head/tail do no arithmetic;
 * the tail_head stores 16 pixels from d0-d3 and uses the generic
 * 'cache_preload' helper (see pixman-arm-neon-asm.h) for prefetching.
 */
1009 .macro pixman_composite_src_0565_0565_process_pixblock_head
1012 .macro pixman_composite_src_0565_0565_process_pixblock_tail
1015 .macro pixman_composite_src_0565_0565_process_pixblock_tail_head
1016 vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
1018 cache_preload 16, 16
/* Instantiate the 16bpp copy; all base registers are 0 because source
   data is both loaded into and stored from d0-d3. */
1021 generate_composite_function \
1022 pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
1023 FLAG_DST_WRITEONLY, \
1024 16, /* number of pixels, processed in a single block */ \
1025 10, /* prefetch distance */ \
1028 pixman_composite_src_0565_0565_process_pixblock_head, \
1029 pixman_composite_src_0565_0565_process_pixblock_tail, \
1030 pixman_composite_src_0565_0565_process_pixblock_tail_head, \
1031 0, /* dst_w_basereg */ \
1032 0, /* dst_r_basereg */ \
1033 0, /* src_basereg */ \
1034 0 /* mask_basereg */
1036 /******************************************************************************/
/*
 * src_n_8: fill an a8 destination with a solid value.  The init
 * replicates the 8-bit value across all of d0 (and, per the omitted
 * lines, presumably q0/q1) so tail_head is a pure store loop.
 */
1038 .macro pixman_composite_src_n_8_process_pixblock_head
1041 .macro pixman_composite_src_n_8_process_pixblock_tail
1044 .macro pixman_composite_src_n_8_process_pixblock_tail_head
/* store 32 a8 pixels per iteration */
1045 vst1.8 {d0, d1, d2, d3}, [DST_W, :128]!
1048 .macro pixman_composite_src_n_8_init
1049 add DUMMY, sp, #ARGS_STACK_OFFSET
1050 vld1.32 {d0[0]}, [DUMMY]
/* vsli doubles the replicated span: 16 -> 32 -> full 64 bits of d0.
 * NOTE(review): the preceding 8->16 replication step (line 1051) is not
 * visible in this extract — confirm against the full file. */
1052 vsli.u64 d0, d0, #16
1053 vsli.u64 d0, d0, #32
1058 .macro pixman_composite_src_n_8_cleanup
/* solid src, no mask, 8 bpp dst; prefetch distance 0 = none needed */
1061 generate_composite_function \
1062 pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
1063 FLAG_DST_WRITEONLY, \
1064 32, /* number of pixels, processed in a single block */ \
1065 0, /* prefetch distance */ \
1066 pixman_composite_src_n_8_init, \
1067 pixman_composite_src_n_8_cleanup, \
1068 pixman_composite_src_n_8_process_pixblock_head, \
1069 pixman_composite_src_n_8_process_pixblock_tail, \
1070 pixman_composite_src_n_8_process_pixblock_tail_head, \
1071 0, /* dst_w_basereg */ \
1072 0, /* dst_r_basereg */ \
1073 0, /* src_basereg */ \
1074 0 /* mask_basereg */
1076 /******************************************************************************/
/*
 * src_n_0565: fill an r5g6b5 destination with a solid value.  Same
 * structure as src_n_8: init replicates the 16-bit color across d0
 * (and, per the omitted lines, presumably the other fill registers),
 * tail_head is a pure store loop.
 */
1078 .macro pixman_composite_src_n_0565_process_pixblock_head
1081 .macro pixman_composite_src_n_0565_process_pixblock_tail
1084 .macro pixman_composite_src_n_0565_process_pixblock_tail_head
/* store 16 r5g6b5 pixels per iteration */
1085 vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
1088 .macro pixman_composite_src_n_0565_init
1089 add DUMMY, sp, #ARGS_STACK_OFFSET
1090 vld1.32 {d0[0]}, [DUMMY]
/* replicate the 16-bit pixel value across all 64 bits of d0 */
1091 vsli.u64 d0, d0, #16
1092 vsli.u64 d0, d0, #32
1097 .macro pixman_composite_src_n_0565_cleanup
1100 generate_composite_function \
1101 pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
1102 FLAG_DST_WRITEONLY, \
1103 16, /* number of pixels, processed in a single block */ \
1104 0, /* prefetch distance */ \
1105 pixman_composite_src_n_0565_init, \
1106 pixman_composite_src_n_0565_cleanup, \
1107 pixman_composite_src_n_0565_process_pixblock_head, \
1108 pixman_composite_src_n_0565_process_pixblock_tail, \
1109 pixman_composite_src_n_0565_process_pixblock_tail_head, \
1110 0, /* dst_w_basereg */ \
1111 0, /* dst_r_basereg */ \
1112 0, /* src_basereg */ \
1113 0 /* mask_basereg */
1115 /******************************************************************************/
/*
 * src_n_8888: fill an a8r8g8b8 destination with a solid value.  The
 * 32-bit color only needs one vsli to fill d0; the omitted lines
 * presumably copy d0 into the remaining fill registers.
 */
1117 .macro pixman_composite_src_n_8888_process_pixblock_head
1120 .macro pixman_composite_src_n_8888_process_pixblock_tail
1123 .macro pixman_composite_src_n_8888_process_pixblock_tail_head
/* store 8 a8r8g8b8 pixels per iteration */
1124 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1127 .macro pixman_composite_src_n_8888_init
1128 add DUMMY, sp, #ARGS_STACK_OFFSET
1129 vld1.32 {d0[0]}, [DUMMY]
/* duplicate the 32-bit color into both halves of d0 */
1130 vsli.u64 d0, d0, #32
1135 .macro pixman_composite_src_n_8888_cleanup
1138 generate_composite_function \
1139 pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
1140 FLAG_DST_WRITEONLY, \
1141 8, /* number of pixels, processed in a single block */ \
1142 0, /* prefetch distance */ \
1143 pixman_composite_src_n_8888_init, \
1144 pixman_composite_src_n_8888_cleanup, \
1145 pixman_composite_src_n_8888_process_pixblock_head, \
1146 pixman_composite_src_n_8888_process_pixblock_tail, \
1147 pixman_composite_src_n_8888_process_pixblock_tail_head, \
1148 0, /* dst_w_basereg */ \
1149 0, /* dst_r_basereg */ \
1150 0, /* src_basereg */ \
1151 0 /* mask_basereg */
1153 /******************************************************************************/
/*
 * src_8888_8888: straight a8r8g8b8 -> a8r8g8b8 copy.  No arithmetic;
 * tail_head only stores the already-loaded block (the framework loads
 * the next one; a cache_preload line may be among the omitted lines).
 */
1155 .macro pixman_composite_src_8888_8888_process_pixblock_head
1158 .macro pixman_composite_src_8888_8888_process_pixblock_tail
1161 .macro pixman_composite_src_8888_8888_process_pixblock_tail_head
1162 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1167 generate_composite_function \
1168 pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
1169 FLAG_DST_WRITEONLY, \
1170 8, /* number of pixels, processed in a single block */ \
1171 10, /* prefetch distance */ \
1174 pixman_composite_src_8888_8888_process_pixblock_head, \
1175 pixman_composite_src_8888_8888_process_pixblock_tail, \
1176 pixman_composite_src_8888_8888_process_pixblock_tail_head, \
1177 0, /* dst_w_basereg */ \
1178 0, /* dst_r_basereg */ \
1179 0, /* src_basereg */ \
1180 0 /* mask_basereg */
1182 /******************************************************************************/
/*
 * src_x888_8888: copy x8r8g8b8 -> a8r8g8b8, forcing the (undefined)
 * alpha byte to 0xff.  The init builds an 0xff000000 mask in q2
 * (visible step: shift left by 24; the preceding vmov of the all-ones
 * value is among the omitted lines); the head presumably ORs q2 into
 * the loaded pixels — the vorr lines are not visible in this extract.
 */
1184 .macro pixman_composite_src_x888_8888_process_pixblock_head
1189 .macro pixman_composite_src_x888_8888_process_pixblock_tail
1192 .macro pixman_composite_src_x888_8888_process_pixblock_tail_head
1193 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1200 .macro pixman_composite_src_x888_8888_init
/* q2 = 0xff000000 per lane (alpha mask for 32 bpp pixels) */
1202 vshl.u32 q2, q2, #24
1205 generate_composite_function \
1206 pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
1207 FLAG_DST_WRITEONLY, \
1208 8, /* number of pixels, processed in a single block */ \
1209 10, /* prefetch distance */ \
1210 pixman_composite_src_x888_8888_init, \
1212 pixman_composite_src_x888_8888_process_pixblock_head, \
1213 pixman_composite_src_x888_8888_process_pixblock_tail, \
1214 pixman_composite_src_x888_8888_process_pixblock_tail_head, \
1215 0, /* dst_w_basereg */ \
1216 0, /* dst_r_basereg */ \
1217 0, /* src_basereg */ \
1218 0 /* mask_basereg */
1220 /******************************************************************************/
/*
 * src_n_8_8888: solid source multiplied by an a8 mask, written to an
 * a8r8g8b8 destination (dst = src * mask).  Per-channel multiply uses
 * the vmull + vrsra + vrshrn sequence, which computes the correctly
 * rounded (x * a + 127) / 255 approximation ((t + (t >> 8) + 0x80) >> 8).
 */
1222 .macro pixman_composite_src_n_8_8888_process_pixblock_head
1223 /* expecting solid source in {d0, d1, d2, d3} */
1224 /* mask is in d24 (d25, d26, d27 are unused) */
/* widen: 16-bit products of each color channel with the mask */
1227 vmull.u8 q8, d24, d0
1228 vmull.u8 q9, d24, d1
1229 vmull.u8 q10, d24, d2
1230 vmull.u8 q11, d24, d3
/* t += t >> 8 with rounding — first half of the /255 approximation */
1231 vrsra.u16 q8, q8, #8
1232 vrsra.u16 q9, q9, #8
1233 vrsra.u16 q10, q10, #8
1234 vrsra.u16 q11, q11, #8
1237 .macro pixman_composite_src_n_8_8888_process_pixblock_tail
/* narrow back to 8 bits with rounding — finishes the /255 */
1238 vrshrn.u16 d28, q8, #8
1239 vrshrn.u16 d29, q9, #8
1240 vrshrn.u16 d30, q10, #8
1241 vrshrn.u16 d31, q11, #8
/*
 * Software-pipelined tail+head: finishes block N (vrshrn/vst4) while
 * starting block N+1 (vmull/vrsra), with PF prefetch ops interleaved to
 * fill pipeline bubbles.  Do not reorder without re-benchmarking.
 */
1244 .macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
1246 PF add PF_X, PF_X, #8
1247 vrshrn.u16 d28, q8, #8
1248 PF tst PF_CTL, #0x0F
1249 vrshrn.u16 d29, q9, #8
1250 PF addne PF_X, PF_X, #8
1251 vrshrn.u16 d30, q10, #8
1252 PF subne PF_CTL, PF_CTL, #1
1253 vrshrn.u16 d31, q11, #8
1255 vmull.u8 q8, d24, d0
1256 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
1257 vmull.u8 q9, d24, d1
1258 PF subge PF_X, PF_X, ORIG_W
1259 vmull.u8 q10, d24, d2
1260 PF subges PF_CTL, PF_CTL, #0x10
1261 vmull.u8 q11, d24, d3
/* ldrgeb: touch the next mask scanline to pull it into the cache */
1262 PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
1263 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1264 vrsra.u16 q8, q8, #8
1265 vrsra.u16 q9, q9, #8
1266 vrsra.u16 q10, q10, #8
1267 vrsra.u16 q11, q11, #8
1270 .macro pixman_composite_src_n_8_8888_init
/* load the solid color; omitted lines presumably vdup the B/G/R/A
 * components into d0-d3 — confirm against the full file */
1271 add DUMMY, sp, #ARGS_STACK_OFFSET
1272 vld1.32 {d3[0]}, [DUMMY]
1279 .macro pixman_composite_src_n_8_8888_cleanup
1282 generate_composite_function \
1283 pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
1284 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
1285 8, /* number of pixels, processed in a single block */ \
1286 5, /* prefetch distance */ \
1287 pixman_composite_src_n_8_8888_init, \
1288 pixman_composite_src_n_8_8888_cleanup, \
1289 pixman_composite_src_n_8_8888_process_pixblock_head, \
1290 pixman_composite_src_n_8_8888_process_pixblock_tail, \
1291 pixman_composite_src_n_8_8888_process_pixblock_tail_head, \
1293 /******************************************************************************/
/*
 * src_n_8_8: solid 8-bit value multiplied by an a8 mask, written to an
 * a8 destination (dst = s * m).  d16 holds the replicated solid value;
 * d24-d27 hold 32 mask pixels.  Same rounded /255 multiply idiom as
 * src_n_8_8888 (vmull + vrsra + vrshrn).
 */
1295 .macro pixman_composite_src_n_8_8_process_pixblock_head
1296 vmull.u8 q0, d24, d16
1297 vmull.u8 q1, d25, d16
1298 vmull.u8 q2, d26, d16
1299 vmull.u8 q3, d27, d16
1300 vrsra.u16 q0, q0, #8
1301 vrsra.u16 q1, q1, #8
1302 vrsra.u16 q2, q2, #8
1303 vrsra.u16 q3, q3, #8
1306 .macro pixman_composite_src_n_8_8_process_pixblock_tail
1307 vrshrn.u16 d28, q0, #8
1308 vrshrn.u16 d29, q1, #8
1309 vrshrn.u16 d30, q2, #8
1310 vrshrn.u16 d31, q3, #8
/* pipelined tail+head with interleaved PF prefetch — order matters */
1313 .macro pixman_composite_src_n_8_8_process_pixblock_tail_head
1315 PF add PF_X, PF_X, #8
1316 vrshrn.u16 d28, q0, #8
1317 PF tst PF_CTL, #0x0F
1318 vrshrn.u16 d29, q1, #8
1319 PF addne PF_X, PF_X, #8
1320 vrshrn.u16 d30, q2, #8
1321 PF subne PF_CTL, PF_CTL, #1
1322 vrshrn.u16 d31, q3, #8
1324 vmull.u8 q0, d24, d16
1325 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
1326 vmull.u8 q1, d25, d16
1327 PF subge PF_X, PF_X, ORIG_W
1328 vmull.u8 q2, d26, d16
1329 PF subges PF_CTL, PF_CTL, #0x10
1330 vmull.u8 q3, d27, d16
1331 PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
1332 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1333 vrsra.u16 q0, q0, #8
1334 vrsra.u16 q1, q1, #8
1335 vrsra.u16 q2, q2, #8
1336 vrsra.u16 q3, q3, #8
1339 .macro pixman_composite_src_n_8_8_init
/* omitted lines presumably vdup the solid byte across d16 */
1340 add DUMMY, sp, #ARGS_STACK_OFFSET
1341 vld1.32 {d16[0]}, [DUMMY]
1345 .macro pixman_composite_src_n_8_8_cleanup
1348 generate_composite_function \
1349 pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
1350 FLAG_DST_WRITEONLY, \
1351 32, /* number of pixels, processed in a single block */ \
1352 5, /* prefetch distance */ \
1353 pixman_composite_src_n_8_8_init, \
1354 pixman_composite_src_n_8_8_cleanup, \
1355 pixman_composite_src_n_8_8_process_pixblock_head, \
1356 pixman_composite_src_n_8_8_process_pixblock_tail, \
1357 pixman_composite_src_n_8_8_process_pixblock_tail_head
1359 /******************************************************************************/
/*
 * over_n_8_8888: solid source OVER an a8r8g8b8 destination through an
 * a8 mask.  Head computes s' = s * m (rounded /255 via vmull + vrshr +
 * vraddhn), then starts d * (255 - s'.alpha); tail finishes that
 * multiply and saturate-adds the masked source.
 */
1361 .macro pixman_composite_over_n_8_8888_process_pixblock_head
1362 /* expecting deinterleaved source data in {d8, d9, d10, d11} */
1363 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
1364 /* and destination data in {d4, d5, d6, d7} */
1365 /* mask is in d24 (d25, d26, d27 are unused) */
/* s * m, widened to 16 bits per channel */
1368 vmull.u8 q6, d24, d8
1369 vmull.u8 q7, d24, d9
1370 vmull.u8 q8, d24, d10
1371 vmull.u8 q9, d24, d11
/* rounded /255: (t + ((t + 0x80) >> 8) + 0x80) >> 8 via vrshr+vraddhn */
1372 vrshr.u16 q10, q6, #8
1373 vrshr.u16 q11, q7, #8
1374 vrshr.u16 q12, q8, #8
1375 vrshr.u16 q13, q9, #8
1376 vraddhn.u16 d0, q6, q10
1377 vraddhn.u16 d1, q7, q11
1378 vraddhn.u16 d2, q8, q12
1379 vraddhn.u16 d3, q9, q13
1380 vmvn.8 d25, d3 /* get inverted alpha */
1381 /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */
1382 /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
1383 /* now do alpha blending */
1384 vmull.u8 q8, d25, d4
1385 vmull.u8 q9, d25, d5
1386 vmull.u8 q10, d25, d6
1387 vmull.u8 q11, d25, d7
1390 .macro pixman_composite_over_n_8_8888_process_pixblock_tail
/* finish d * (1 - alpha) and add the masked source, saturating */
1391 vrshr.u16 q14, q8, #8
1392 vrshr.u16 q15, q9, #8
1393 vrshr.u16 q6, q10, #8
1394 vrshr.u16 q7, q11, #8
1395 vraddhn.u16 d28, q14, q8
1396 vraddhn.u16 d29, q15, q9
1397 vraddhn.u16 d30, q6, q10
1398 vraddhn.u16 d31, q7, q11
1399 vqadd.u8 q14, q0, q14
1400 vqadd.u8 q15, q1, q15
/*
 * Pipelined tail+head: completes block N while block N+1's destination
 * is loaded and its multiplies are issued; PF ops prefetch both dst and
 * mask scanlines.  Instruction order is tuned — do not reshuffle.
 */
1403 .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
1404 vrshr.u16 q14, q8, #8
1405 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1406 vrshr.u16 q15, q9, #8
1408 vrshr.u16 q6, q10, #8
1409 PF add PF_X, PF_X, #8
1410 vrshr.u16 q7, q11, #8
1411 PF tst PF_CTL, #0x0F
1412 vraddhn.u16 d28, q14, q8
1413 PF addne PF_X, PF_X, #8
1414 vraddhn.u16 d29, q15, q9
1415 PF subne PF_CTL, PF_CTL, #1
1416 vraddhn.u16 d30, q6, q10
1418 vraddhn.u16 d31, q7, q11
1419 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
1420 vmull.u8 q6, d24, d8
1421 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
1422 vmull.u8 q7, d24, d9
1423 PF subge PF_X, PF_X, ORIG_W
1424 vmull.u8 q8, d24, d10
1425 PF subges PF_CTL, PF_CTL, #0x10
1426 vmull.u8 q9, d24, d11
1427 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1428 vqadd.u8 q14, q0, q14
1429 PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
1430 vqadd.u8 q15, q1, q15
1431 vrshr.u16 q10, q6, #8
1432 vrshr.u16 q11, q7, #8
1433 vrshr.u16 q12, q8, #8
1434 vrshr.u16 q13, q9, #8
1435 vraddhn.u16 d0, q6, q10
1436 vraddhn.u16 d1, q7, q11
1437 vraddhn.u16 d2, q8, q12
1438 vraddhn.u16 d3, q9, q13
1439 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1441 vmull.u8 q8, d25, d4
1442 vmull.u8 q9, d25, d5
1443 vmull.u8 q10, d25, d6
1444 vmull.u8 q11, d25, d7
1447 .macro pixman_composite_over_n_8_8888_init
/* load solid color into d11; omitted lines presumably vpush d8-d15 and
 * vdup the B/G/R components into d8-d10 — confirm against full file */
1448 add DUMMY, sp, #ARGS_STACK_OFFSET
1450 vld1.32 {d11[0]}, [DUMMY]
1457 .macro pixman_composite_over_n_8_8888_cleanup
1461 generate_composite_function \
1462 pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
1463 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1464 8, /* number of pixels, processed in a single block */ \
1465 5, /* prefetch distance */ \
1466 pixman_composite_over_n_8_8888_init, \
1467 pixman_composite_over_n_8_8888_cleanup, \
1468 pixman_composite_over_n_8_8888_process_pixblock_head, \
1469 pixman_composite_over_n_8_8888_process_pixblock_tail, \
1470 pixman_composite_over_n_8_8888_process_pixblock_tail_head
1472 /******************************************************************************/
/*
 * over_n_8_8: solid 8-bit source OVER an a8 destination through an a8
 * mask.  d8 holds the replicated solid value; d24-d27 hold the mask,
 * d4-d7 the destination.  Head computes s' = s * m and starts
 * d * (255 - m) (the vmvn of d24-d27 is among the omitted lines —
 * confirm); tail finishes and saturate-adds.
 */
1474 .macro pixman_composite_over_n_8_8_process_pixblock_head
1475 vmull.u8 q0, d24, d8
1476 vmull.u8 q1, d25, d8
1477 vmull.u8 q6, d26, d8
1478 vmull.u8 q7, d27, d8
1479 vrshr.u16 q10, q0, #8
1480 vrshr.u16 q11, q1, #8
1481 vrshr.u16 q12, q6, #8
1482 vrshr.u16 q13, q7, #8
1483 vraddhn.u16 d0, q0, q10
1484 vraddhn.u16 d1, q1, q11
1485 vraddhn.u16 d2, q6, q12
1486 vraddhn.u16 d3, q7, q13
1489 vmull.u8 q8, d24, d4
1490 vmull.u8 q9, d25, d5
1491 vmull.u8 q10, d26, d6
1492 vmull.u8 q11, d27, d7
1495 .macro pixman_composite_over_n_8_8_process_pixblock_tail
1496 vrshr.u16 q14, q8, #8
1497 vrshr.u16 q15, q9, #8
1498 vrshr.u16 q12, q10, #8
1499 vrshr.u16 q13, q11, #8
1500 vraddhn.u16 d28, q14, q8
1501 vraddhn.u16 d29, q15, q9
1502 vraddhn.u16 d30, q12, q10
1503 vraddhn.u16 d31, q13, q11
1504 vqadd.u8 q14, q0, q14
1505 vqadd.u8 q15, q1, q15
1508 /* TODO: expand macros and do better instructions scheduling */
1509 .macro pixman_composite_over_n_8_8_process_pixblock_tail_head
1510 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1511 pixman_composite_over_n_8_8_process_pixblock_tail
1513 cache_preload 32, 32
1514 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1515 pixman_composite_over_n_8_8_process_pixblock_head
1518 .macro pixman_composite_over_n_8_8_init
/* omitted lines presumably vdup the solid byte across d8 */
1519 add DUMMY, sp, #ARGS_STACK_OFFSET
1521 vld1.32 {d8[0]}, [DUMMY]
1525 .macro pixman_composite_over_n_8_8_cleanup
1529 generate_composite_function \
1530 pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
1531 FLAG_DST_READWRITE, \
1532 32, /* number of pixels, processed in a single block */ \
1533 5, /* prefetch distance */ \
1534 pixman_composite_over_n_8_8_init, \
1535 pixman_composite_over_n_8_8_cleanup, \
1536 pixman_composite_over_n_8_8_process_pixblock_head, \
1537 pixman_composite_over_n_8_8_process_pixblock_tail, \
1538 pixman_composite_over_n_8_8_process_pixblock_tail_head
1540 /******************************************************************************/
/*
 * over_n_8888_8888_ca: solid source OVER an a8r8g8b8 destination with a
 * component-alpha (per-channel) a8r8g8b8 mask.  Head fuses the
 * combine_mask_ca step (s' = s * m per channel, m' = m * src.alpha)
 * with the start of combine_over_ca; tail completes the blend.
 */
1542 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1544 * 'combine_mask_ca' replacement
1546 * input: solid src (n) in {d8, d9, d10, d11}
1547 * dest in {d4, d5, d6, d7 }
1548 * mask in {d24, d25, d26, d27}
1549 * output: updated src in {d0, d1, d2, d3 }
1550 * updated mask in {d24, d25, d26, d3 }
/* per-channel s * m */
1552 vmull.u8 q0, d24, d8
1553 vmull.u8 q1, d25, d9
1554 vmull.u8 q6, d26, d10
1555 vmull.u8 q7, d27, d11
/* per-channel m * src_alpha (d11) */
1556 vmull.u8 q9, d11, d25
1557 vmull.u8 q12, d11, d24
1558 vmull.u8 q13, d11, d26
/* rounded /255 reductions for both products */
1559 vrshr.u16 q8, q0, #8
1560 vrshr.u16 q10, q1, #8
1561 vrshr.u16 q11, q6, #8
1562 vraddhn.u16 d0, q0, q8
1563 vraddhn.u16 d1, q1, q10
1564 vraddhn.u16 d2, q6, q11
1565 vrshr.u16 q11, q12, #8
1566 vrshr.u16 q8, q9, #8
1567 vrshr.u16 q6, q13, #8
1568 vrshr.u16 q10, q7, #8
1569 vraddhn.u16 d24, q12, q11
1570 vraddhn.u16 d25, q9, q8
1571 vraddhn.u16 d26, q13, q6
1572 vraddhn.u16 d3, q7, q10
1574 * 'combine_over_ca' replacement
1576 * output: updated dest in {d28, d29, d30, d31}
/* NOTE(review): the vmvn inverting the mask (lines 1577-1579) is not
 * visible in this extract — d24-d27 are presumably inverted here */
1580 vmull.u8 q8, d24, d4
1581 vmull.u8 q9, d25, d5
1583 vmull.u8 q10, d26, d6
1584 vmull.u8 q11, d27, d7
1587 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
1588 /* ... continue 'combine_over_ca' replacement */
1589 vrshr.u16 q14, q8, #8
1590 vrshr.u16 q15, q9, #8
1591 vrshr.u16 q6, q10, #8
1592 vrshr.u16 q7, q11, #8
1593 vraddhn.u16 d28, q14, q8
1594 vraddhn.u16 d29, q15, q9
1595 vraddhn.u16 d30, q6, q10
1596 vraddhn.u16 d31, q7, q11
/* saturating add of the masked source completes OVER */
1597 vqadd.u8 q14, q0, q14
1598 vqadd.u8 q15, q1, q15
/* tail+head glue: finish block N, load next dst, then re-run head */
1601 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1602 vrshr.u16 q14, q8, #8
1603 vrshr.u16 q15, q9, #8
1604 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1605 vrshr.u16 q6, q10, #8
1606 vrshr.u16 q7, q11, #8
1607 vraddhn.u16 d28, q14, q8
1608 vraddhn.u16 d29, q15, q9
1609 vraddhn.u16 d30, q6, q10
1610 vraddhn.u16 d31, q7, q11
1612 vqadd.u8 q14, q0, q14
1613 vqadd.u8 q15, q1, q15
1615 pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1616 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1619 .macro pixman_composite_over_n_8888_8888_ca_init
/* load solid color; omitted lines presumably vpush and vdup B/G/R */
1620 add DUMMY, sp, #ARGS_STACK_OFFSET
1622 vld1.32 {d11[0]}, [DUMMY]
1629 .macro pixman_composite_over_n_8888_8888_ca_cleanup
1633 generate_composite_function \
1634 pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
1635 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1636 8, /* number of pixels, processed in a single block */ \
1637 5, /* prefetch distance */ \
1638 pixman_composite_over_n_8888_8888_ca_init, \
1639 pixman_composite_over_n_8888_8888_ca_cleanup, \
1640 pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
1641 pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
1642 pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1644 /******************************************************************************/
/*
 * over_n_8888_0565_ca: solid source OVER an r5g6b5 destination with a
 * component-alpha mask.  Like over_n_8888_8888_ca, but the destination
 * must additionally be expanded from r5g6b5 to planar 8-bit before the
 * blend (vshrn + vsri) and re-packed afterwards (vshll + vsri).
 */
1646 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
1648 * 'combine_mask_ca' replacement
1650 * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A]
1651 * mask in {d24, d25, d26} [B, G, R]
1652 * output: updated src in {d0, d1, d2 } [B, G, R]
1653 * updated mask in {d24, d25, d26} [B, G, R]
/* s * m per channel, and m * src_alpha in parallel */
1655 vmull.u8 q0, d24, d8
1656 vmull.u8 q1, d25, d9
1657 vmull.u8 q6, d26, d10
1658 vmull.u8 q9, d11, d25
1659 vmull.u8 q12, d11, d24
1660 vmull.u8 q13, d11, d26
1661 vrshr.u16 q8, q0, #8
1662 vrshr.u16 q10, q1, #8
1663 vrshr.u16 q11, q6, #8
1664 vraddhn.u16 d0, q0, q8
1665 vraddhn.u16 d1, q1, q10
1666 vraddhn.u16 d2, q6, q11
1667 vrshr.u16 q11, q12, #8
1668 vrshr.u16 q8, q9, #8
1669 vrshr.u16 q6, q13, #8
1670 vraddhn.u16 d24, q12, q11
1671 vraddhn.u16 d25, q9, q8
1673 * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
1674 * and put data into d16 - blue, d17 - green, d18 - red
/* green: top 6 bits; red: top 5 bits; vsri replicates high bits into
 * the low ones so the 8-bit values span the full 0..255 range */
1676 vshrn.u16 d17, q2, #3
1677 vshrn.u16 d18, q2, #8
1678 vraddhn.u16 d26, q13, q6
1680 vsri.u8 d18, d18, #5
1681 vsri.u8 d17, d17, #6
1683 * 'combine_over_ca' replacement
1685 * output: updated dest in d16 - blue, d17 - green, d18 - red
/* NOTE(review): the mask inversion (vmvn, lines ~1686-1687) and the
 * blue-channel vsri are not visible in this extract — confirm */
1688 vshrn.u16 d16, q2, #2
1690 vmull.u8 q6, d16, d24
1691 vmull.u8 q7, d17, d25
1692 vmull.u8 q11, d18, d26
1695 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
1696 /* ... continue 'combine_over_ca' replacement */
1697 vrshr.u16 q10, q6, #8
1698 vrshr.u16 q14, q7, #8
1699 vrshr.u16 q15, q11, #8
1700 vraddhn.u16 d16, q10, q6
1701 vraddhn.u16 d17, q14, q7
1702 vraddhn.u16 d18, q15, q11
/* NOTE(review): the vqadd for d16/d17 (lines ~1703,1705) are omitted
 * from this extract; only the red-channel add is visible */
1704 vqadd.u8 d18, d2, d18
1706 * convert the results in d16, d17, d18 to r5g6b5 and store
1707 * them into {d28, d29}
1709 vshll.u8 q14, d18, #8
1710 vshll.u8 q10, d17, #8
1711 vshll.u8 q15, d16, #8
1712 vsri.u16 q14, q10, #5
1713 vsri.u16 q14, q15, #11
/*
 * Fully expanded tail+head for better scheduling: interleaves the
 * repack/store of block N with the mask-combine and 0565 expansion of
 * block N+1.  d22 temporarily stands in for d18's role.
 */
1716 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
1718 vrshr.u16 q10, q6, #8
1719 vrshr.u16 q14, q7, #8
1720 vld1.16 {d4, d5}, [DST_R, :128]!
1721 vrshr.u16 q15, q11, #8
1722 vraddhn.u16 d16, q10, q6
1723 vraddhn.u16 d17, q14, q7
1724 vraddhn.u16 d22, q15, q11
1725 /* process_pixblock_head */
1727 * 'combine_mask_ca' replacement
1729 * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A]
1730 * mask in {d24, d25, d26} [B, G, R]
1731 * output: updated src in {d0, d1, d2 } [B, G, R]
1732 * updated mask in {d24, d25, d26} [B, G, R]
1734 vmull.u8 q6, d26, d10
1736 vmull.u8 q0, d24, d8
1737 vqadd.u8 d22, d2, d22
1738 vmull.u8 q1, d25, d9
1740 * convert the result in d16, d17, d22 to r5g6b5 and store
1741 * it into {d28, d29}
1743 vshll.u8 q14, d22, #8
1744 vshll.u8 q10, d17, #8
1745 vshll.u8 q15, d16, #8
1746 vmull.u8 q9, d11, d25
1747 vsri.u16 q14, q10, #5
1748 vmull.u8 q12, d11, d24
1749 vmull.u8 q13, d11, d26
1750 vsri.u16 q14, q15, #11
1752 vrshr.u16 q8, q0, #8
1753 vrshr.u16 q10, q1, #8
1754 vrshr.u16 q11, q6, #8
1755 vraddhn.u16 d0, q0, q8
1756 vraddhn.u16 d1, q1, q10
1757 vraddhn.u16 d2, q6, q11
1758 vrshr.u16 q11, q12, #8
1759 vrshr.u16 q8, q9, #8
1760 vrshr.u16 q6, q13, #8
1761 vraddhn.u16 d24, q12, q11
1762 vraddhn.u16 d25, q9, q8
1764 * convert 8 r5g6b5 pixel data from {d4, d5} to planar
1765 * 8-bit format and put data into d16 - blue, d17 - green,
1768 vshrn.u16 d17, q2, #3
1769 vshrn.u16 d18, q2, #8
1770 vraddhn.u16 d26, q13, q6
1772 vsri.u8 d17, d17, #6
1773 vsri.u8 d18, d18, #5
1775 * 'combine_over_ca' replacement
1777 * output: updated dest in d16 - blue, d17 - green, d18 - red
1780 vshrn.u16 d16, q2, #2
1782 vmull.u8 q7, d17, d25
1783 vmull.u8 q6, d16, d24
1784 vmull.u8 q11, d18, d26
/* store the repacked r5g6b5 pixels of the previous block */
1785 vst1.16 {d28, d29}, [DST_W, :128]!
1788 .macro pixman_composite_over_n_8888_0565_ca_init
/* load solid color; omitted lines presumably vpush and vdup B/G/R */
1789 add DUMMY, sp, #ARGS_STACK_OFFSET
1791 vld1.32 {d11[0]}, [DUMMY]
1798 .macro pixman_composite_over_n_8888_0565_ca_cleanup
1802 generate_composite_function \
1803 pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
1804 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1805 8, /* number of pixels, processed in a single block */ \
1806 5, /* prefetch distance */ \
1807 pixman_composite_over_n_8888_0565_ca_init, \
1808 pixman_composite_over_n_8888_0565_ca_cleanup, \
1809 pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
1810 pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
1811 pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
1813 /******************************************************************************/
/*
 * in_n_8: IN operator with a solid source and an a8 destination
 * (dst = dst * src.alpha).  The solid alpha is replicated in d3; head
 * starts the widening multiplies (the q8/q9 pair for d4/d5 is among the
 * omitted lines), tail does the rounded /255 reduction.
 */
1815 .macro pixman_composite_in_n_8_process_pixblock_head
1816 /* expecting source data in {d0, d1, d2, d3} */
1817 /* and destination data in {d4, d5, d6, d7} */
1820 vmull.u8 q10, d6, d3
1821 vmull.u8 q11, d7, d3
1824 .macro pixman_composite_in_n_8_process_pixblock_tail
1825 vrshr.u16 q14, q8, #8
1826 vrshr.u16 q15, q9, #8
1827 vrshr.u16 q12, q10, #8
1828 vrshr.u16 q13, q11, #8
1829 vraddhn.u16 d28, q8, q14
1830 vraddhn.u16 d29, q9, q15
1831 vraddhn.u16 d30, q10, q12
1832 vraddhn.u16 d31, q11, q13
/* simple (non-pipelined) tail+head: tail, load, preload, head, store */
1835 .macro pixman_composite_in_n_8_process_pixblock_tail_head
1836 pixman_composite_in_n_8_process_pixblock_tail
1837 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1838 cache_preload 32, 32
1839 pixman_composite_in_n_8_process_pixblock_head
1840 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1843 .macro pixman_composite_in_n_8_init
/* omitted lines presumably vdup the alpha byte across d3 */
1844 add DUMMY, sp, #ARGS_STACK_OFFSET
1845 vld1.32 {d3[0]}, [DUMMY]
1849 .macro pixman_composite_in_n_8_cleanup
1852 generate_composite_function \
1853 pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
1854 FLAG_DST_READWRITE, \
1855 32, /* number of pixels, processed in a single block */ \
1856 5, /* prefetch distance */ \
1857 pixman_composite_in_n_8_init, \
1858 pixman_composite_in_n_8_cleanup, \
1859 pixman_composite_in_n_8_process_pixblock_head, \
1860 pixman_composite_in_n_8_process_pixblock_tail, \
1861 pixman_composite_in_n_8_process_pixblock_tail_head, \
1862 28, /* dst_w_basereg */ \
1863 4, /* dst_r_basereg */ \
1864 0, /* src_basereg */ \
1865 24 /* mask_basereg */
/*
 * add_n_8_8: ADD operator, solid source through an a8 mask onto an a8
 * destination (dst = satadd(dst, mask * src.alpha)).  All the work is
 * in head; tail is empty.
 */
1867 .macro pixman_composite_add_n_8_8_process_pixblock_head
1868 /* expecting source data in {d8, d9, d10, d11} */
1869 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
1870 /* and destination data in {d4, d5, d6, d7} */
1871 /* mask is in d24, d25, d26, d27 */
/* mask * alpha, then rounded /255 */
1872 vmull.u8 q0, d24, d11
1873 vmull.u8 q1, d25, d11
1874 vmull.u8 q6, d26, d11
1875 vmull.u8 q7, d27, d11
1876 vrshr.u16 q10, q0, #8
1877 vrshr.u16 q11, q1, #8
1878 vrshr.u16 q12, q6, #8
1879 vrshr.u16 q13, q7, #8
1880 vraddhn.u16 d0, q0, q10
1881 vraddhn.u16 d1, q1, q11
1882 vraddhn.u16 d2, q6, q12
1883 vraddhn.u16 d3, q7, q13
/* saturating add onto the destination (q2/q3 = d4-d7) */
1884 vqadd.u8 q14, q0, q2
1885 vqadd.u8 q15, q1, q3
1888 .macro pixman_composite_add_n_8_8_process_pixblock_tail
1891 /* TODO: expand macros and do better instructions scheduling */
1892 .macro pixman_composite_add_n_8_8_process_pixblock_tail_head
1893 pixman_composite_add_n_8_8_process_pixblock_tail
1894 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1895 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1897 cache_preload 32, 32
1898 pixman_composite_add_n_8_8_process_pixblock_head
1901 .macro pixman_composite_add_n_8_8_init
/* load solid color; omitted lines presumably vpush and vdup d11 */
1902 add DUMMY, sp, #ARGS_STACK_OFFSET
1904 vld1.32 {d11[0]}, [DUMMY]
1908 .macro pixman_composite_add_n_8_8_cleanup
1912 generate_composite_function \
1913 pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
1914 FLAG_DST_READWRITE, \
1915 32, /* number of pixels, processed in a single block */ \
1916 5, /* prefetch distance */ \
1917 pixman_composite_add_n_8_8_init, \
1918 pixman_composite_add_n_8_8_cleanup, \
1919 pixman_composite_add_n_8_8_process_pixblock_head, \
1920 pixman_composite_add_n_8_8_process_pixblock_tail, \
1921 pixman_composite_add_n_8_8_process_pixblock_tail_head
1923 /******************************************************************************/
/*
 * add_8_8_8: ADD operator with a8 source, a8 mask and a8 destination
 * (dst = satadd(dst, src * mask)).  All the work is in head; tail is
 * empty, and tail_head is the simple store/load/preload/head sequence.
 */
1925 .macro pixman_composite_add_8_8_8_process_pixblock_head
1926 /* expecting source data in {d0, d1, d2, d3} */
1927 /* destination data in {d4, d5, d6, d7} */
1928 /* mask in {d24, d25, d26, d27} */
/* src * mask with rounded /255 */
1929 vmull.u8 q8, d24, d0
1930 vmull.u8 q9, d25, d1
1931 vmull.u8 q10, d26, d2
1932 vmull.u8 q11, d27, d3
1933 vrshr.u16 q0, q8, #8
1934 vrshr.u16 q1, q9, #8
1935 vrshr.u16 q12, q10, #8
1936 vrshr.u16 q13, q11, #8
1937 vraddhn.u16 d0, q0, q8
1938 vraddhn.u16 d1, q1, q9
1939 vraddhn.u16 d2, q12, q10
1940 vraddhn.u16 d3, q13, q11
/* saturating add onto the destination */
1941 vqadd.u8 q14, q0, q2
1942 vqadd.u8 q15, q1, q3
1945 .macro pixman_composite_add_8_8_8_process_pixblock_tail
1948 /* TODO: expand macros and do better instructions scheduling */
1949 .macro pixman_composite_add_8_8_8_process_pixblock_tail_head
1950 pixman_composite_add_8_8_8_process_pixblock_tail
1951 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1952 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1955 cache_preload 32, 32
1956 pixman_composite_add_8_8_8_process_pixblock_head
1959 .macro pixman_composite_add_8_8_8_init
1962 .macro pixman_composite_add_8_8_8_cleanup
1965 generate_composite_function \
1966 pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
1967 FLAG_DST_READWRITE, \
1968 32, /* number of pixels, processed in a single block */ \
1969 5, /* prefetch distance */ \
1970 pixman_composite_add_8_8_8_init, \
1971 pixman_composite_add_8_8_8_cleanup, \
1972 pixman_composite_add_8_8_8_process_pixblock_head, \
1973 pixman_composite_add_8_8_8_process_pixblock_tail, \
1974 pixman_composite_add_8_8_8_process_pixblock_tail_head
1976 /******************************************************************************/
/*
 * add_8888_8888_8888: ADD operator with a8r8g8b8 source, mask and
 * destination (dst = satadd(dst, src * mask.alpha)).  d27 is the mask
 * alpha channel (32bpp deinterleaved).  Uses the vmull + vrsra +
 * vrshrn rounded-/255 idiom; bubble comments mark scheduling gaps that
 * the pipelined tail_head below fills with loads/stores.
 */
1978 .macro pixman_composite_add_8888_8888_8888_process_pixblock_head
1979 /* expecting source data in {d0, d1, d2, d3} */
1980 /* destination data in {d4, d5, d6, d7} */
1981 /* mask in {d24, d25, d26, d27} */
1982 vmull.u8 q8, d27, d0
1983 vmull.u8 q9, d27, d1
1984 vmull.u8 q10, d27, d2
1985 vmull.u8 q11, d27, d3
1986 /* 1 cycle bubble */
1987 vrsra.u16 q8, q8, #8
1988 vrsra.u16 q9, q9, #8
1989 vrsra.u16 q10, q10, #8
1990 vrsra.u16 q11, q11, #8
1993 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
1994 /* 2 cycle bubble */
1995 vrshrn.u16 d28, q8, #8
1996 vrshrn.u16 d29, q9, #8
1997 vrshrn.u16 d30, q10, #8
1998 vrshrn.u16 d31, q11, #8
1999 vqadd.u8 q14, q2, q14
2000 /* 1 cycle bubble */
2001 vqadd.u8 q15, q3, q15
/* pipelined tail+head: narrow/add for block N interleaved with the
 * multiplies for block N+1; dst load/store fill the bubbles */
2004 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
2006 vrshrn.u16 d28, q8, #8
2008 vrshrn.u16 d29, q9, #8
2009 vmull.u8 q8, d27, d0
2010 vrshrn.u16 d30, q10, #8
2011 vmull.u8 q9, d27, d1
2012 vrshrn.u16 d31, q11, #8
2013 vmull.u8 q10, d27, d2
2014 vqadd.u8 q14, q2, q14
2015 vmull.u8 q11, d27, d3
2016 vqadd.u8 q15, q3, q15
2017 vrsra.u16 q8, q8, #8
2018 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
2019 vrsra.u16 q9, q9, #8
2020 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2021 vrsra.u16 q10, q10, #8
2025 vrsra.u16 q11, q11, #8
2028 generate_composite_function \
2029 pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
2030 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2031 8, /* number of pixels, processed in a single block */ \
2032 10, /* prefetch distance */ \
2035 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
2036 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
2037 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
/* single-scanline variant for the generic scanline compositing path */
2039 generate_composite_function_single_scanline \
2040 pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
2041 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2042 8, /* number of pixels, processed in a single block */ \
2045 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
2046 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
2047 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
2049 /******************************************************************************/
/*
 * add_8888_8_8888: ADD with an a8 mask.  Reuses the 8888_8888_8888
 * pixel-block macros; mask_basereg 27 makes the 8-bit mask load land in
 * d27, exactly where those macros expect the mask alpha.
 */
2051 generate_composite_function \
2052 pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
2053 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2054 8, /* number of pixels, processed in a single block */ \
2055 5, /* prefetch distance */ \
2058 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
2059 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
2060 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
2061 28, /* dst_w_basereg */ \
2062 4, /* dst_r_basereg */ \
2063 0, /* src_basereg */ \
2064 27 /* mask_basereg */
2066 /******************************************************************************/
/*
 * add_n_8_8888: ADD with a solid source and an a8 mask.  The solid
 * color is loaded into d3 (omitted lines presumably vdup the B/G/R/A
 * components into d0-d3); the shared 8888_8888_8888 block macros then
 * treat it as the per-pixel source.
 */
2068 .macro pixman_composite_add_n_8_8888_init
2069 add DUMMY, sp, #ARGS_STACK_OFFSET
2070 vld1.32 {d3[0]}, [DUMMY]
2077 .macro pixman_composite_add_n_8_8888_cleanup
2080 generate_composite_function \
2081 pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
2082 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2083 8, /* number of pixels, processed in a single block */ \
2084 5, /* prefetch distance */ \
2085 pixman_composite_add_n_8_8888_init, \
2086 pixman_composite_add_n_8_8888_cleanup, \
2087 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
2088 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
2089 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
2090 28, /* dst_w_basereg */ \
2091 4, /* dst_r_basereg */ \
2092 0, /* src_basereg */ \
2093 27 /* mask_basereg */
2095 /******************************************************************************/
/*
 * add_8888_n_8888: ADD with a solid mask.  The mask's 32-bit value is
 * loaded so its alpha ends up in d27 (omitted lines presumably vdup
 * the alpha byte across d27), matching the shared block macros.
 */
2097 .macro pixman_composite_add_8888_n_8888_init
/* mask is the 3rd stacked argument, hence the +8 byte offset */
2098 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
2099 vld1.32 {d27[0]}, [DUMMY]
2103 .macro pixman_composite_add_8888_n_8888_cleanup
2106 generate_composite_function \
2107 pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
2108 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2109 8, /* number of pixels, processed in a single block */ \
2110 5, /* prefetch distance */ \
2111 pixman_composite_add_8888_n_8888_init, \
2112 pixman_composite_add_8888_n_8888_cleanup, \
2113 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
2114 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
2115 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
2116 28, /* dst_w_basereg */ \
2117 4, /* dst_r_basereg */ \
2118 0, /* src_basereg */ \
2119 27 /* mask_basereg */
2121 /******************************************************************************/
/*
 * out_reverse with a8r8g8b8 source, solid mask and a8r8g8b8 destination:
 * dest = dest * (1 - alpha(src * mask)).  The head stage computes the
 * masked source and starts multiplying the destination by the inverted
 * alpha; the tail stage finishes the rounded division by 255.
 */
2123 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
2124 /* expecting source data in {d0, d1, d2, d3} */
2125 /* destination data in {d4, d5, d6, d7} */
2126 /* solid mask is in d15 */
/* Multiply each source channel by the mask (8x8 -> 16-bit products)... */
2129 vmull.u8 q8, d15, d3
2130 vmull.u8 q6, d15, d2
2131 vmull.u8 q5, d15, d1
2132 vmull.u8 q4, d15, d0
/* ...then divide by 255 with rounding: rounded shift (vrshr #8) followed
 * by rounded add-and-narrow (vraddhn) back into d0-d3. */
2133 vrshr.u16 q13, q8, #8
2134 vrshr.u16 q12, q6, #8
2135 vrshr.u16 q11, q5, #8
2136 vrshr.u16 q10, q4, #8
2137 vraddhn.u16 d3, q8, q13
2138 vraddhn.u16 d2, q6, q12
2139 vraddhn.u16 d1, q5, q11
2140 vraddhn.u16 d0, q4, q10
2141 vmvn.8 d24, d3 /* get inverted alpha */
2142 /* now do alpha blending */
2143 vmull.u8 q8, d24, d4
2144 vmull.u8 q9, d24, d5
2145 vmull.u8 q10, d24, d6
2146 vmull.u8 q11, d24, d7
/* Tail: finish dest * (255 - alpha) / 255 with rounding into d28-d31. */
2149 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
2150 vrshr.u16 q14, q8, #8
2151 vrshr.u16 q15, q9, #8
2152 vrshr.u16 q12, q10, #8
2153 vrshr.u16 q13, q11, #8
2154 vraddhn.u16 d28, q14, q8
2155 vraddhn.u16 d29, q15, q9
2156 vraddhn.u16 d30, q12, q10
2157 vraddhn.u16 d31, q13, q11
2160 /* TODO: expand macros and do better instructions scheduling */
/* Combined tail+head for the per-mask (8888_8888_8888) pipelined loop:
 * load next dest block, finish previous block, start next, store result.
 * NOTE(review): source/mask fetch lines are elided in this excerpt. */
2161 .macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
2162 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
2163 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
2167 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
2168 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* NOTE(review): the tail_head argument below has no trailing comma, unlike
 * the invocation at original line 2089; gas also accepts whitespace as a
 * macro-argument separator, so this assembles - confirm and normalize. */
2171 generate_composite_function_single_scanline \
2172 pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
2173 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2174 8, /* number of pixels, processed in a single block */ \
2175 default_init_need_all_regs, \
2176 default_cleanup_need_all_regs, \
2177 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
2178 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
2179 pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \
2180 28, /* dst_w_basereg */ \
2181 4, /* dst_r_basereg */ \
2182 0, /* src_basereg */ \
2183 12 /* mask_basereg */
2185 /******************************************************************************/
/*
 * over_8888_n_8888: OVER operator with a8r8g8b8 source, solid mask and
 * a8r8g8b8 destination.  Built on out_reverse: dest*(1-alpha(src*mask)),
 * then the masked source (left in q0/q1 by the head) is saturating-added.
 */
2187 .macro pixman_composite_over_8888_n_8888_process_pixblock_head
2188 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
2191 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail
2192 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
/* result = masked_src + dest*(1-alpha), with saturation */
2193 vqadd.u8 q14, q0, q14
2194 vqadd.u8 q15, q1, q15
2197 /* TODO: expand macros and do better instructions scheduling */
2198 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
2199 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
2200 pixman_composite_over_8888_n_8888_process_pixblock_tail
2203 pixman_composite_over_8888_n_8888_process_pixblock_head
2204 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* Init: load the solid mask into d15[0].
 * NOTE(review): the "add DUMMY, sp, #..." address setup and the vdup that
 * presumably follows are elided in this excerpt - confirm in the full file. */
2207 .macro pixman_composite_over_8888_n_8888_init
2210 vld1.32 {d15[0]}, [DUMMY]
2214 .macro pixman_composite_over_8888_n_8888_cleanup
2218 generate_composite_function \
2219 pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
2220 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2221 8, /* number of pixels, processed in a single block */ \
2222 5, /* prefetch distance */ \
2223 pixman_composite_over_8888_n_8888_init, \
2224 pixman_composite_over_8888_n_8888_cleanup, \
2225 pixman_composite_over_8888_n_8888_process_pixblock_head, \
2226 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2227 pixman_composite_over_8888_n_8888_process_pixblock_tail_head
2229 /******************************************************************************/
2231 /* TODO: expand macros and do better instructions scheduling */
/* over_8888_8888_8888: OVER with a full a8r8g8b8 mask; reuses the solid-mask
 * head/tail, with the per-pixel mask fetch elided in this excerpt. */
2232 .macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
2233 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
2234 pixman_composite_over_8888_n_8888_process_pixblock_tail
2238 pixman_composite_over_8888_n_8888_process_pixblock_head
2239 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* NOTE(review): the tail_head argument below lacks a trailing comma (other
 * call sites use one); gas treats whitespace as an argument separator, so
 * this still assembles - confirm and normalize for consistency. */
2242 generate_composite_function \
2243 pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
2244 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2245 8, /* number of pixels, processed in a single block */ \
2246 5, /* prefetch distance */ \
2247 default_init_need_all_regs, \
2248 default_cleanup_need_all_regs, \
2249 pixman_composite_over_8888_n_8888_process_pixblock_head, \
2250 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2251 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
2252 28, /* dst_w_basereg */ \
2253 4, /* dst_r_basereg */ \
2254 0, /* src_basereg */ \
2255 12 /* mask_basereg */
/* Single-scanline variant used by the general compositing path (same
 * comma remark as above applies to the tail_head argument). */
2257 generate_composite_function_single_scanline \
2258 pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
2259 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2260 8, /* number of pixels, processed in a single block */ \
2261 default_init_need_all_regs, \
2262 default_cleanup_need_all_regs, \
2263 pixman_composite_over_8888_n_8888_process_pixblock_head, \
2264 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2265 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
2266 28, /* dst_w_basereg */ \
2267 4, /* dst_r_basereg */ \
2268 0, /* src_basereg */ \
2269 12 /* mask_basereg */
2271 /******************************************************************************/
2273 /* TODO: expand macros and do better instructions scheduling */
/* over_8888_8_8888: OVER with an a8 mask (mask_basereg 15 => mask in d15);
 * the a8 mask fetch is elided in this excerpt. */
2274 .macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
2275 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
2276 pixman_composite_over_8888_n_8888_process_pixblock_tail
2280 pixman_composite_over_8888_n_8888_process_pixblock_head
2281 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/* NOTE(review): no comma after the tail_head argument below - assembles via
 * whitespace separation in gas, but inconsistent with other call sites. */
2284 generate_composite_function \
2285 pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
2286 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2287 8, /* number of pixels, processed in a single block */ \
2288 5, /* prefetch distance */ \
2289 default_init_need_all_regs, \
2290 default_cleanup_need_all_regs, \
2291 pixman_composite_over_8888_n_8888_process_pixblock_head, \
2292 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2293 pixman_composite_over_8888_8_8888_process_pixblock_tail_head \
2294 28, /* dst_w_basereg */ \
2295 4, /* dst_r_basereg */ \
2296 0, /* src_basereg */ \
2297 15 /* mask_basereg */
2299 /******************************************************************************/
/*
 * src_0888_0888: plain 24bpp -> 24bpp copy.  Head/tail are empty; the
 * tail_head just stores 8 pixels (the matching vld3 load and the init/
 * cleanup arguments of the generator call are elided in this excerpt).
 */
2301 .macro pixman_composite_src_0888_0888_process_pixblock_head
2304 .macro pixman_composite_src_0888_0888_process_pixblock_tail
2307 .macro pixman_composite_src_0888_0888_process_pixblock_tail_head
2308 vst3.8 {d0, d1, d2}, [DST_W]!
2313 generate_composite_function \
2314 pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
2315 FLAG_DST_WRITEONLY, \
2316 8, /* number of pixels, processed in a single block */ \
2317 10, /* prefetch distance */ \
2320 pixman_composite_src_0888_0888_process_pixblock_head, \
2321 pixman_composite_src_0888_0888_process_pixblock_tail, \
2322 pixman_composite_src_0888_0888_process_pixblock_tail_head, \
2323 0, /* dst_w_basereg */ \
2324 0, /* dst_r_basereg */ \
2325 0, /* src_basereg */ \
2326 0 /* mask_basereg */
2328 /******************************************************************************/
/*
 * src_0888_8888_rev: expand 24bpp (reversed channel order) to 32bpp.
 * Macro bodies and the init body are largely elided in this excerpt;
 * only the 32bpp store of the tail_head is visible.
 */
2330 .macro pixman_composite_src_0888_8888_rev_process_pixblock_head
2334 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
2337 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
2338 vst4.8 {d0, d1, d2, d3}, [DST_W]!
2344 .macro pixman_composite_src_0888_8888_rev_init
2348 generate_composite_function \
2349 pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
2350 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2351 8, /* number of pixels, processed in a single block */ \
2352 10, /* prefetch distance */ \
2353 pixman_composite_src_0888_8888_rev_init, \
2355 pixman_composite_src_0888_8888_rev_process_pixblock_head, \
2356 pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
2357 pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
2358 0, /* dst_w_basereg */ \
2359 0, /* dst_r_basereg */ \
2360 0, /* src_basereg */ \
2361 0 /* mask_basereg */
2363 /******************************************************************************/
/*
 * src_0888_0565_rev: convert 24bpp (reversed) to r5g6b5.  The tail packs
 * the three channels into 16 bits: red goes in via vshll #8, then green
 * and blue are inserted with vsri (shift-right-insert) at bit offsets
 * 5 and 11.  Source channel loads are elided in this excerpt.
 */
2365 .macro pixman_composite_src_0888_0565_rev_process_pixblock_head
2370 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
2371 vshll.u8 q14, d0, #8
2372 vsri.u16 q14, q8, #5
2373 vsri.u16 q14, q9, #11
2376 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
2377 vshll.u8 q14, d0, #8
2379 vsri.u16 q14, q8, #5
2380 vsri.u16 q14, q9, #11
2382 vst1.16 {d28, d29}, [DST_W, :128]!
2386 generate_composite_function \
2387 pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
2388 FLAG_DST_WRITEONLY, \
2389 8, /* number of pixels, processed in a single block */ \
2390 10, /* prefetch distance */ \
2393 pixman_composite_src_0888_0565_rev_process_pixblock_head, \
2394 pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
2395 pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
2396 28, /* dst_w_basereg */ \
2397 0, /* dst_r_basereg */ \
2398 0, /* src_basereg */ \
2399 0 /* mask_basereg */
2401 /******************************************************************************/
/*
 * src_pixbuf_8888: premultiply a pixbuf source by its own alpha (d3) and
 * write a8r8g8b8.  q8/q9 products are computed on lines elided in this
 * excerpt; the visible vmull handles one channel and the tail performs the
 * rounded division by 255 (vrshr #8 + vraddhn) into d28-d30.
 */
2403 .macro pixman_composite_src_pixbuf_8888_process_pixblock_head
2406 vmull.u8 q10, d3, d2
2409 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
2410 vrshr.u16 q11, q8, #8
2412 vrshr.u16 q12, q9, #8
2413 vrshr.u16 q13, q10, #8
2414 vraddhn.u16 d30, q11, q8
2415 vraddhn.u16 d29, q12, q9
2416 vraddhn.u16 d28, q13, q10
/* Pipelined tail+head with interleaved source prefetch (PF_* bookkeeping);
 * the source load and some PF lines are elided in this excerpt. */
2419 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
2420 vrshr.u16 q11, q8, #8
2422 vrshr.u16 q12, q9, #8
2423 vrshr.u16 q13, q10, #8
2425 vraddhn.u16 d30, q11, q8
2426 PF add PF_X, PF_X, #8
2428 PF addne PF_X, PF_X, #8
2429 PF subne PF_CTL, PF_CTL, #1
2430 vraddhn.u16 d29, q12, q9
2431 vraddhn.u16 d28, q13, q10
2434 vmull.u8 q10, d3, d2
2435 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2437 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
2438 PF subge PF_X, PF_X, ORIG_W
2439 PF subges PF_CTL, PF_CTL, #0x10
2440 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
2443 generate_composite_function \
2444 pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
2445 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2446 8, /* number of pixels, processed in a single block */ \
2447 10, /* prefetch distance */ \
2450 pixman_composite_src_pixbuf_8888_process_pixblock_head, \
2451 pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
2452 pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
2453 28, /* dst_w_basereg */ \
2454 0, /* dst_r_basereg */ \
2455 0, /* src_basereg */ \
2456 0 /* mask_basereg */
2458 /******************************************************************************/
/*
 * src_rpixbuf_8888: same premultiply-by-own-alpha operation as
 * src_pixbuf_8888 above, but with the red/blue result registers swapped
 * (d28/d30 exchanged in the vraddhn destinations) for the reversed
 * channel order.  Elided lines mirror the pixbuf variant.
 */
2460 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
2463 vmull.u8 q10, d3, d2
2466 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
2467 vrshr.u16 q11, q8, #8
2469 vrshr.u16 q12, q9, #8
2470 vrshr.u16 q13, q10, #8
2471 vraddhn.u16 d28, q11, q8
2472 vraddhn.u16 d29, q12, q9
2473 vraddhn.u16 d30, q13, q10
2476 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
2477 vrshr.u16 q11, q8, #8
2479 vrshr.u16 q12, q9, #8
2480 vrshr.u16 q13, q10, #8
2482 vraddhn.u16 d28, q11, q8
2483 PF add PF_X, PF_X, #8
2485 PF addne PF_X, PF_X, #8
2486 PF subne PF_CTL, PF_CTL, #1
2487 vraddhn.u16 d29, q12, q9
2488 vraddhn.u16 d30, q13, q10
2491 vmull.u8 q10, d3, d2
2492 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2494 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
2495 PF subge PF_X, PF_X, ORIG_W
2496 PF subges PF_CTL, PF_CTL, #0x10
2497 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
2500 generate_composite_function \
2501 pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
2502 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2503 8, /* number of pixels, processed in a single block */ \
2504 10, /* prefetch distance */ \
2507 pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
2508 pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
2509 pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
2510 28, /* dst_w_basereg */ \
2511 0, /* dst_r_basereg */ \
2512 0, /* src_basereg */ \
2513 0 /* mask_basereg */
2515 /******************************************************************************/
/*
 * over_0565_8_0565: OVER with r5g6b5 source and destination and an a8
 * mask in d15.  Both 0565 inputs are first expanded to x888, the source
 * is multiplied by the mask, the destination by inverted alpha, and the
 * result is packed back to 0565.  Some instructions (e.g. the inverted-
 * alpha computation feeding d7/q8/q9) are elided in this excerpt.
 */
2517 .macro pixman_composite_over_0565_8_0565_process_pixblock_head
2518 /* mask is in d15 */
2519 convert_0565_to_x888 q4, d2, d1, d0
2520 convert_0565_to_x888 q5, d6, d5, d4
2521 /* source pixel data is in {d0, d1, d2, XX} */
2522 /* destination pixel data is in {d4, d5, d6, XX} */
/* src * mask (16-bit products)... */
2524 vmull.u8 q6, d15, d2
2525 vmull.u8 q5, d15, d1
2526 vmull.u8 q4, d15, d0
2529 vmull.u8 q13, d7, d6
/* ...rounded /255 back into d0-d2 */
2530 vrshr.u16 q12, q6, #8
2531 vrshr.u16 q11, q5, #8
2532 vrshr.u16 q10, q4, #8
2533 vraddhn.u16 d2, q6, q12
2534 vraddhn.u16 d1, q5, q11
2535 vraddhn.u16 d0, q4, q10
/* Tail: finish dest*(1-alpha), add the masked source with saturation,
 * and repack the 32bpp result to r5g6b5 in d28/d29. */
2538 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail
2539 vrshr.u16 q14, q8, #8
2540 vrshr.u16 q15, q9, #8
2541 vrshr.u16 q12, q13, #8
2542 vraddhn.u16 d28, q14, q8
2543 vraddhn.u16 d29, q15, q9
2544 vraddhn.u16 d30, q12, q13
2545 vqadd.u8 q0, q0, q14
2546 vqadd.u8 q1, q1, q15
2547 /* 32bpp result is in {d0, d1, d2, XX} */
2548 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2551 /* TODO: expand macros and do better instructions scheduling */
2552 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
2554 pixman_composite_over_0565_8_0565_process_pixblock_tail
2556 vld1.16 {d10, d11}, [DST_R, :128]!
2558 pixman_composite_over_0565_8_0565_process_pixblock_head
2559 vst1.16 {d28, d29}, [DST_W, :128]!
2562 generate_composite_function \
2563 pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
2564 FLAG_DST_READWRITE, \
2565 8, /* number of pixels, processed in a single block */ \
2566 5, /* prefetch distance */ \
2567 default_init_need_all_regs, \
2568 default_cleanup_need_all_regs, \
2569 pixman_composite_over_0565_8_0565_process_pixblock_head, \
2570 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2571 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2572 28, /* dst_w_basereg */ \
2573 10, /* dst_r_basereg */ \
2574 8, /* src_basereg */ \
2575 15 /* mask_basereg */
2577 /******************************************************************************/
/*
 * over_0565_n_0565: same as over_0565_8_0565 but with a solid mask; the
 * init macro loads the mask value (second stack argument) into d15[0],
 * then the 0565_8_0565 pixblock macros are reused unchanged.
 */
2579 .macro pixman_composite_over_0565_n_0565_init
2580 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
2582 vld1.32 {d15[0]}, [DUMMY]
2586 .macro pixman_composite_over_0565_n_0565_cleanup
2590 generate_composite_function \
2591 pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
2592 FLAG_DST_READWRITE, \
2593 8, /* number of pixels, processed in a single block */ \
2594 5, /* prefetch distance */ \
2595 pixman_composite_over_0565_n_0565_init, \
2596 pixman_composite_over_0565_n_0565_cleanup, \
2597 pixman_composite_over_0565_8_0565_process_pixblock_head, \
2598 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2599 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2600 28, /* dst_w_basereg */ \
2601 10, /* dst_r_basereg */ \
2602 8, /* src_basereg */ \
2603 15 /* mask_basereg */
2605 /******************************************************************************/
/*
 * add_0565_8_0565: ADD with r5g6b5 source/destination and a8 mask in d15.
 * The head expands both 0565 inputs to x888 and computes src*mask/255
 * with rounding; the saturating add of the destination happens on lines
 * elided in this excerpt, and the tail repacks to r5g6b5.
 */
2607 .macro pixman_composite_add_0565_8_0565_process_pixblock_head
2608 /* mask is in d15 */
2609 convert_0565_to_x888 q4, d2, d1, d0
2610 convert_0565_to_x888 q5, d6, d5, d4
2611 /* source pixel data is in {d0, d1, d2, XX} */
2612 /* destination pixel data is in {d4, d5, d6, XX} */
2613 vmull.u8 q6, d15, d2
2614 vmull.u8 q5, d15, d1
2615 vmull.u8 q4, d15, d0
2616 vrshr.u16 q12, q6, #8
2617 vrshr.u16 q11, q5, #8
2618 vrshr.u16 q10, q4, #8
2619 vraddhn.u16 d2, q6, q12
2620 vraddhn.u16 d1, q5, q11
2621 vraddhn.u16 d0, q4, q10
2624 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail
2627 /* 32bpp result is in {d0, d1, d2, XX} */
2628 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2631 /* TODO: expand macros and do better instructions scheduling */
2632 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
2634 pixman_composite_add_0565_8_0565_process_pixblock_tail
2636 vld1.16 {d10, d11}, [DST_R, :128]!
2638 pixman_composite_add_0565_8_0565_process_pixblock_head
2639 vst1.16 {d28, d29}, [DST_W, :128]!
2642 generate_composite_function \
2643 pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
2644 FLAG_DST_READWRITE, \
2645 8, /* number of pixels, processed in a single block */ \
2646 5, /* prefetch distance */ \
2647 default_init_need_all_regs, \
2648 default_cleanup_need_all_regs, \
2649 pixman_composite_add_0565_8_0565_process_pixblock_head, \
2650 pixman_composite_add_0565_8_0565_process_pixblock_tail, \
2651 pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
2652 28, /* dst_w_basereg */ \
2653 10, /* dst_r_basereg */ \
2654 8, /* src_basereg */ \
2655 15 /* mask_basereg */
2657 /******************************************************************************/
/*
 * out_reverse_8_0565: dest = dest * (1 - mask) for an a8 "source" acting
 * as coverage (in d15) over an r5g6b5 destination.  The destination is
 * expanded to x888, multiplied by the inverted mask with rounded /255,
 * and packed back to r5g6b5.
 */
2659 .macro pixman_composite_out_reverse_8_0565_process_pixblock_head
2660 /* mask is in d15 */
2661 convert_0565_to_x888 q5, d6, d5, d4
2662 /* destination pixel data is in {d4, d5, d6, xx} */
2663 vmvn.8 d24, d15 /* get inverted alpha */
2664 /* now do alpha blending */
2665 vmull.u8 q8, d24, d4
2666 vmull.u8 q9, d24, d5
2667 vmull.u8 q10, d24, d6
/* Tail: rounded /255 of the three channel products, then repack. */
2670 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
2671 vrshr.u16 q14, q8, #8
2672 vrshr.u16 q15, q9, #8
2673 vrshr.u16 q12, q10, #8
2674 vraddhn.u16 d0, q14, q8
2675 vraddhn.u16 d1, q15, q9
2676 vraddhn.u16 d2, q12, q10
2677 /* 32bpp result is in {d0, d1, d2, XX} */
2678 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2681 /* TODO: expand macros and do better instructions scheduling */
2682 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
2684 pixman_composite_out_reverse_8_0565_process_pixblock_tail
2685 vld1.16 {d10, d11}, [DST_R, :128]!
2687 pixman_composite_out_reverse_8_0565_process_pixblock_head
2688 vst1.16 {d28, d29}, [DST_W, :128]!
2691 generate_composite_function \
2692 pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
2693 FLAG_DST_READWRITE, \
2694 8, /* number of pixels, processed in a single block */ \
2695 5, /* prefetch distance */ \
2696 default_init_need_all_regs, \
2697 default_cleanup_need_all_regs, \
2698 pixman_composite_out_reverse_8_0565_process_pixblock_head, \
2699 pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
2700 pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
2701 28, /* dst_w_basereg */ \
2702 10, /* dst_r_basereg */ \
2703 15, /* src_basereg */ \
2704 0 /* mask_basereg */
2706 /******************************************************************************/
/*
 * out_reverse_8_8888: dest = dest * (1 - mask) for an a8 coverage value
 * (d0, inverted into d1) over an a8r8g8b8 destination.  Two of the four
 * channel multiplies (q8/q9) are on lines elided in this excerpt.
 */
2708 .macro pixman_composite_out_reverse_8_8888_process_pixblock_head
2710 /* destination pixel data is in {d4, d5, d6, d7} */
2711 vmvn.8 d1, d0 /* get inverted alpha */
2712 /* now do alpha blending */
2715 vmull.u8 q10, d1, d6
2716 vmull.u8 q11, d1, d7
/* Tail: rounded /255 of all four channel products into d28-d31. */
2719 .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
2720 vrshr.u16 q14, q8, #8
2721 vrshr.u16 q15, q9, #8
2722 vrshr.u16 q12, q10, #8
2723 vrshr.u16 q13, q11, #8
2724 vraddhn.u16 d28, q14, q8
2725 vraddhn.u16 d29, q15, q9
2726 vraddhn.u16 d30, q12, q10
2727 vraddhn.u16 d31, q13, q11
2728 /* 32bpp result is in {d28, d29, d30, d31} */
2731 /* TODO: expand macros and do better instructions scheduling */
2732 .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
2734 pixman_composite_out_reverse_8_8888_process_pixblock_tail
2735 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
2737 pixman_composite_out_reverse_8_8888_process_pixblock_head
2738 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2741 generate_composite_function \
2742 pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
2743 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2744 8, /* number of pixels, processed in a single block */ \
2745 5, /* prefetch distance */ \
2748 pixman_composite_out_reverse_8_8888_process_pixblock_head, \
2749 pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
2750 pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
2751 28, /* dst_w_basereg */ \
2752 4, /* dst_r_basereg */ \
2753 0, /* src_basereg */ \
2754 0 /* mask_basereg */
2756 /******************************************************************************/
/*
 * Nearest-neighbour scaled scanline functions.  Each invocation reuses the
 * unscaled pixblock macros defined earlier in the file with a scanline-
 * oriented generator.  NOTE(review): the init/cleanup argument lines of
 * several invocations are elided in this excerpt - confirm in the full file.
 */
2758 generate_composite_function_nearest_scanline \
2759 pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
2760 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2761 8, /* number of pixels, processed in a single block */ \
2764 pixman_composite_over_8888_8888_process_pixblock_head, \
2765 pixman_composite_over_8888_8888_process_pixblock_tail, \
2766 pixman_composite_over_8888_8888_process_pixblock_tail_head
2768 generate_composite_function_nearest_scanline \
2769 pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
2770 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2771 8, /* number of pixels, processed in a single block */ \
2774 pixman_composite_over_8888_0565_process_pixblock_head, \
2775 pixman_composite_over_8888_0565_process_pixblock_tail, \
2776 pixman_composite_over_8888_0565_process_pixblock_tail_head, \
2777 28, /* dst_w_basereg */ \
2778 4, /* dst_r_basereg */ \
2779 0, /* src_basereg */ \
2780 24 /* mask_basereg */
2782 generate_composite_function_nearest_scanline \
2783 pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
2784 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2785 8, /* number of pixels, processed in a single block */ \
2788 pixman_composite_src_8888_0565_process_pixblock_head, \
2789 pixman_composite_src_8888_0565_process_pixblock_tail, \
2790 pixman_composite_src_8888_0565_process_pixblock_tail_head
2792 generate_composite_function_nearest_scanline \
2793 pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
2794 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2795 8, /* number of pixels, processed in a single block */ \
2798 pixman_composite_src_0565_8888_process_pixblock_head, \
2799 pixman_composite_src_0565_8888_process_pixblock_tail, \
2800 pixman_composite_src_0565_8888_process_pixblock_tail_head
2802 generate_composite_function_nearest_scanline \
2803 pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
2804 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2805 8, /* number of pixels, processed in a single block */ \
2806 default_init_need_all_regs, \
2807 default_cleanup_need_all_regs, \
2808 pixman_composite_over_8888_8_0565_process_pixblock_head, \
2809 pixman_composite_over_8888_8_0565_process_pixblock_tail, \
2810 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
2811 28, /* dst_w_basereg */ \
2812 4, /* dst_r_basereg */ \
2813 8, /* src_basereg */ \
2814 24 /* mask_basereg */
2816 generate_composite_function_nearest_scanline \
2817 pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
2818 FLAG_DST_READWRITE, \
2819 8, /* number of pixels, processed in a single block */ \
2820 default_init_need_all_regs, \
2821 default_cleanup_need_all_regs, \
2822 pixman_composite_over_0565_8_0565_process_pixblock_head, \
2823 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2824 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2825 28, /* dst_w_basereg */ \
2826 10, /* dst_r_basereg */ \
2827 8, /* src_basereg */ \
2828 15 /* mask_basereg */
2830 /******************************************************************************/
2832 /* Supplementary macro for setting function attributes */
/* Declares an exported function symbol; marks it as %function for the ELF
 * symbol table.  NOTE(review): the .global/.func/alignment lines and .endm
 * are elided in this excerpt - confirm in the full file. */
2833 .macro pixman_asm_function fname
2838 .type fname, %function
2844 * Bilinear scaling support code which tries to provide pixel fetching, color
2845 * format conversion, and interpolation as separate macros which can be used
2846 * as the basic building blocks for constructing bilinear scanline functions.
/*
 * Fetch the two horizontally adjacent a8r8g8b8 pixels needed for one
 * bilinear sample: X (16.16 fixed point) selects the column, STRIDE moves
 * from the top scanline to the bottom one.  NOTE(review): the "add X, X,
 * UX" coordinate advance appears on a line elided in this excerpt.
 */
2849 .macro bilinear_load_8888 reg1, reg2, tmp
2850 mov TMP1, X, asr #16
2852 add TMP1, TOP, TMP1, asl #2
2853 vld1.32 {reg1}, [TMP1], STRIDE
2854 vld1.32 {reg2}, [TMP1]
/* Same fetch for r5g6b5: load the two 16-bit pixels into one register
 * and expand them to x888 before interpolation. */
2857 .macro bilinear_load_0565 reg1, reg2, tmp
2858 mov TMP1, X, asr #16
2860 add TMP1, TOP, TMP1, asl #1
2861 vld1.32 {reg2[0]}, [TMP1], STRIDE
2862 vld1.32 {reg2[1]}, [TMP1]
2863 convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
/* Load and vertically interpolate two 8888 pixels: acc = top*d28 + bottom*d29
 * (d28/d29 hold the vertical interpolation weights). */
2866 .macro bilinear_load_and_vertical_interpolate_two_8888 \
2867 acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
2869 bilinear_load_8888 reg1, reg2, tmp1
2870 vmull.u8 acc1, reg1, d28
2871 vmlal.u8 acc1, reg2, d29
2872 bilinear_load_8888 reg3, reg4, tmp2
2873 vmull.u8 acc2, reg3, d28
2874 vmlal.u8 acc2, reg4, d29
/* Four-pixel 8888 variant: simply runs the two-pixel macro twice. */
2877 .macro bilinear_load_and_vertical_interpolate_four_8888 \
2878 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
2879 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
2881 bilinear_load_and_vertical_interpolate_two_8888 \
2882 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
2883 bilinear_load_and_vertical_interpolate_two_8888 \
2884 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
/*
 * 0565 variants of the load-and-vertical-interpolate macros: fetch the
 * top/bottom r5g6b5 pixel pairs, expand to x888, then interpolate with
 * the vertical weights in d28/d29.  NOTE(review): the "add X, X, UX"
 * advances and part of the vzip deinterleave sequence are on lines elided
 * in this excerpt - confirm in the full file.
 */
2887 .macro bilinear_load_and_vertical_interpolate_two_0565 \
2888 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
2890 mov TMP1, X, asr #16
2892 add TMP1, TOP, TMP1, asl #1
2893 mov TMP2, X, asr #16
2895 add TMP2, TOP, TMP2, asl #1
2896 vld1.32 {acc2lo[0]}, [TMP1], STRIDE
2897 vld1.32 {acc2hi[0]}, [TMP2], STRIDE
2898 vld1.32 {acc2lo[1]}, [TMP1]
2899 vld1.32 {acc2hi[1]}, [TMP2]
2900 convert_0565_to_x888 acc2, reg3, reg2, reg1
2905 vmull.u8 acc1, reg1, d28
2906 vmlal.u8 acc1, reg2, d29
2907 vmull.u8 acc2, reg3, d28
2908 vmlal.u8 acc2, reg4, d29
/* Four-pixel 0565 variant, written out in full (rather than calling the
 * two-pixel macro twice) so loads, conversions and vzips interleave. */
2911 .macro bilinear_load_and_vertical_interpolate_four_0565 \
2912 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
2913 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
2915 mov TMP1, X, asr #16
2917 add TMP1, TOP, TMP1, asl #1
2918 mov TMP2, X, asr #16
2920 add TMP2, TOP, TMP2, asl #1
2921 vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
2922 vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
2923 vld1.32 {xacc2lo[1]}, [TMP1]
2924 vld1.32 {xacc2hi[1]}, [TMP2]
2925 convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
2926 mov TMP1, X, asr #16
2928 add TMP1, TOP, TMP1, asl #1
2929 mov TMP2, X, asr #16
2931 add TMP2, TOP, TMP2, asl #1
2932 vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
2933 vzip.u8 xreg1, xreg3
2934 vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
2935 vzip.u8 xreg2, xreg4
2936 vld1.32 {yacc2lo[1]}, [TMP1]
2937 vzip.u8 xreg3, xreg4
2938 vld1.32 {yacc2hi[1]}, [TMP2]
2939 vzip.u8 xreg1, xreg2
2940 convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
2941 vmull.u8 xacc1, xreg1, d28
2942 vzip.u8 yreg1, yreg3
2943 vmlal.u8 xacc1, xreg2, d29
2944 vzip.u8 yreg2, yreg4
2945 vmull.u8 xacc2, xreg3, d28
2946 vzip.u8 yreg3, yreg4
2947 vmlal.u8 xacc2, xreg4, d29
2948 vzip.u8 yreg1, yreg2
2949 vmull.u8 yacc1, yreg1, d28
2950 vmlal.u8 yacc1, yreg2, d29
2951 vmull.u8 yacc2, yreg3, d28
2952 vmlal.u8 yacc2, yreg4, d29
/*
 * Store numpix interpolated pixels.  numpix must be 4, 2 or 1; the .if
 * chain selecting among the stores is partly elided in this excerpt, and
 * any other value trips the (deliberately unquoted) .error directive at
 * assembly time.
 */
2955 .macro bilinear_store_8888 numpix, tmp1, tmp2
2957 vst1.32 {d0, d1}, [OUT, :128]!
2959 vst1.32 {d0}, [OUT, :64]!
2961 vst1.32 {d0[0]}, [OUT, :32]!
2963 .error bilinear_store_8888 numpix is unsupported
/* 0565 store: first repack the x888 result to r5g6b5, then store
 * 4, 2 or 1 pixels. */
2967 .macro bilinear_store_0565 numpix, tmp1, tmp2
2972 convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
2974 vst1.16 {d2}, [OUT, :64]!
2976 vst1.32 {d2[0]}, [OUT, :32]!
2978 vst1.16 {d2[0]}, [OUT, :16]!
2980 .error bilinear_store_0565 numpix is unsupported
/*
 * Generic bilinear interpolation at 1/2/4-pixel granularity.  Vertical
 * interpolation produces 16-bit values; the horizontal pass then computes
 * v*256 - v*frac + v_next*frac via vshll/vmlsl/vmlal with the horizontal
 * weights in d30/d31, and vshrn #16 narrows back to 8 bits per channel.
 * q12 accumulates the horizontal weight, advanced by q13 per iteration.
 */
2984 .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
2985 bilinear_load_&src_fmt d0, d1, d2
2986 vmull.u8 q1, d0, d28
2987 vmlal.u8 q1, d1, d29
2988 /* 5 cycles bubble */
2989 vshll.u16 q0, d2, #8
2990 vmlsl.u16 q0, d2, d30
2991 vmlal.u16 q0, d3, d30
2992 /* 5 cycles bubble */
2993 vshrn.u32 d0, q0, #16
2994 /* 3 cycles bubble */
2996 /* 1 cycle bubble */
2997 bilinear_store_&dst_fmt 1, q2, q3
3000 .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
3001 bilinear_load_and_vertical_interpolate_two_&src_fmt \
3002 q1, q11, d0, d1, d20, d21, d22, d23
3003 vshll.u16 q0, d2, #8
3004 vmlsl.u16 q0, d2, d30
3005 vmlal.u16 q0, d3, d30
3006 vshll.u16 q10, d22, #8
3007 vmlsl.u16 q10, d22, d31
3008 vmlal.u16 q10, d23, d31
3009 vshrn.u32 d0, q0, #16
3010 vshrn.u32 d1, q10, #16
/* advance the horizontal weight accumulator for the next pixels */
3011 vshr.u16 q15, q12, #8
3012 vadd.u16 q12, q12, q13
3014 bilinear_store_&dst_fmt 2, q2, q3
3017 .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
3018 bilinear_load_and_vertical_interpolate_four_&src_fmt \
3019 q1, q11, d0, d1, d20, d21, d22, d23 \
3020 q3, q9, d4, d5, d16, d17, d18, d19
3022 sub TMP1, TMP1, STRIDE
3023 vshll.u16 q0, d2, #8
3024 vmlsl.u16 q0, d2, d30
3025 vmlal.u16 q0, d3, d30
3026 vshll.u16 q10, d22, #8
3027 vmlsl.u16 q10, d22, d31
3028 vmlal.u16 q10, d23, d31
3029 vshr.u16 q15, q12, #8
3030 vshll.u16 q2, d6, #8
3031 vmlsl.u16 q2, d6, d30
3032 vmlal.u16 q2, d7, d30
3033 vshll.u16 q8, d18, #8
3035 vmlsl.u16 q8, d18, d31
3036 vmlal.u16 q8, d19, d31
3037 vadd.u16 q12, q12, q13
3038 vshrn.u32 d0, q0, #16
3039 vshrn.u32 d1, q10, #16
3040 vshrn.u32 d4, q2, #16
3041 vshrn.u32 d5, q8, #16
3042 vshr.u16 q15, q12, #8
3045 vadd.u16 q12, q12, q13
3046 bilinear_store_&dst_fmt 4, q2, q3
/*
 * Dispatch wrappers: if a format-specific hand-optimized head/tail/
 * tail_head implementation exists (signalled by the corresponding
 * have_bilinear_interpolate_* symbol being .set), use it; otherwise fall
 * back to the generic four-pixel macro above.  Eight-pixel versions
 * default to two four-pixel steps.
 */
3049 .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
3050 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
3051 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
3053 bilinear_interpolate_four_pixels src_fmt, dst_fmt
3057 .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
3058 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
3059 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
3063 .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
3064 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
3065 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
3067 bilinear_interpolate_four_pixels src_fmt, dst_fmt
3071 .macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
3072 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
3073 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
3075 bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
3076 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
3080 .macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
3081 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
3082 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
3084 bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
3088 .macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
3089 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
3090 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
3092 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
3093 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
/* Flags for generate_bilinear_scanline_func (bitmask: unroll depth and
 * whether q4-q7 may be clobbered). */
3097 .set BILINEAR_FLAG_UNROLL_4, 0
3098 .set BILINEAR_FLAG_UNROLL_8, 1
3099 .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
3102 * Main template macro for generating NEON optimized bilinear scanline
3105 * Bilinear scanline scaler macro template uses the following arguments:
3106 * fname - name of the function to generate
3107 * src_fmt - source color format (8888 or 0565)
3108 * dst_fmt - destination color format (8888 or 0565)
3109 * bpp_shift - (1 << bpp_shift) is the size of source pixel in bytes
3110 * prefetch_distance - prefetch in the source image by that many
/*
 * Instantiates one bilinear scanline scaler.  Structure: align the
 * destination by processing 1, then 2, then (when unrolling by 8) 4
 * pixels, run the unrolled main loop via the head/tail_head/tail
 * dispatch macros, then mop up trailing 2- and 1-pixel remainders.
 * NOTE(review): this excerpt elides many lines of the macro (register
 * aliases, branch labels, conditional branches, vpush/vpop of q4-q7,
 * and the bx lr return) - consult the full file before modifying.
 */
3114 .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
3115 src_bpp_shift, dst_bpp_shift, \
3116 prefetch_distance, flags
3118 pixman_asm_function fname
3135 push {r4, r5, r6, r7, r8, r9}
3136 mov PF_OFFS, #prefetch_distance
3137 ldmia ip, {WB, X, UX, WIDTH}
3138 mul PF_OFFS, PF_OFFS, UX
3140 .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
3144 sub STRIDE, BOTTOM, TOP
3154 vadd.u16 d25, d25, d26
3156 /* ensure good destination alignment */
3159 tst OUT, #(1 << dst_bpp_shift)
3161 vshr.u16 q15, q12, #8
3162 vadd.u16 q12, q12, q13
3163 bilinear_interpolate_last_pixel src_fmt, dst_fmt
3164 sub WIDTH, WIDTH, #1
3166 vadd.u16 q13, q13, q13
3167 vshr.u16 q15, q12, #8
3168 vadd.u16 q12, q12, q13
3172 tst OUT, #(1 << (dst_bpp_shift + 1))
3174 bilinear_interpolate_two_pixels src_fmt, dst_fmt
3175 sub WIDTH, WIDTH, #2
3177 .if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
3178 /*********** 8 pixels per iteration *****************/
3181 tst OUT, #(1 << (dst_bpp_shift + 2))
3183 bilinear_interpolate_four_pixels src_fmt, dst_fmt
3184 sub WIDTH, WIDTH, #4
3186 subs WIDTH, WIDTH, #8
3188 mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
3189 bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
3190 subs WIDTH, WIDTH, #8
3193 bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
3194 subs WIDTH, WIDTH, #8
3197 bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
3201 bilinear_interpolate_four_pixels src_fmt, dst_fmt
3204 /*********** 4 pixels per iteration *****************/
3205 subs WIDTH, WIDTH, #4
3207 mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
3208 bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
3209 subs WIDTH, WIDTH, #4
3212 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
3213 subs WIDTH, WIDTH, #4
3216 bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
3218 /****************************************************/
3220 /* handle the remaining trailing pixels */
3223 bilinear_interpolate_two_pixels src_fmt, dst_fmt
3227 bilinear_interpolate_last_pixel src_fmt, dst_fmt
3229 .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
3232 pop {r4, r5, r6, r7, r8, r9}
3252 /*****************************************************************************/
/* Register this specialized a8r8g8b8 -> a8r8g8b8 four-pixel variant so
 * the generic dispatch macros pick it instead of the common code. */
3254 .set have_bilinear_interpolate_four_pixels_8888_8888, 1
/*
 * Pipeline "head": start interpolating four 8888 pixels.  For each
 * destination pixel the source address TOP + (X >> 16) * 4 is formed,
 * the left/right pixel pair is loaded from the top row and (STRIDE
 * bytes further) from the bottom row, and the two rows are blended
 * vertically with weights d28/d29 into q8..q11.  The horizontal blend
 * (weights d30/d31) is then started into q0/q1; the tail finishes it.
 * NOTE(review): the x-advance ('X += UX' style) instructions are not
 * visible in this excerpt - confirm against the full file.
 */
3256 .macro bilinear_interpolate_four_pixels_8888_8888_head
/* TMP1 = &top_row[X >> 16]; 'asl #2' == 4 bytes per 8888 pixel. */
3257 mov TMP1, X, asr #16
3259 add TMP1, TOP, TMP1, asl #2
3260 mov TMP2, X, asr #16
3262 add TMP2, TOP, TMP2, asl #2
/* Pixel 0: top-row pair in d22, bottom-row pair in d23. */
3264 vld1.32 {d22}, [TMP1], STRIDE
3265 vld1.32 {d23}, [TMP1]
3266 mov TMP3, X, asr #16
3268 add TMP3, TOP, TMP3, asl #2
/* Vertical blend for pixel 0: q8 = top * d28 + bottom * d29. */
3269 vmull.u8 q8, d22, d28
3270 vmlal.u8 q8, d23, d29
3272 vld1.32 {d22}, [TMP2], STRIDE
3273 vld1.32 {d23}, [TMP2]
3274 mov TMP4, X, asr #16
3276 add TMP4, TOP, TMP4, asl #2
/* Vertical blend for pixel 1 into q9. */
3277 vmull.u8 q9, d22, d28
3278 vmlal.u8 q9, d23, d29
3280 vld1.32 {d22}, [TMP3], STRIDE
3281 vld1.32 {d23}, [TMP3]
/* Vertical blend for pixel 2 into q10. */
3282 vmull.u8 q10, d22, d28
3283 vmlal.u8 q10, d23, d29
/* Start horizontal blend for pixel 0:
 * q0 = d16 * (256 - d30) + d17 * d30 (left/right halves of q8). */
3285 vshll.u16 q0, d16, #8
3286 vmlsl.u16 q0, d16, d30
3287 vmlal.u16 q0, d17, d30
3290 vld1.32 {d16}, [TMP4], STRIDE
3291 vld1.32 {d17}, [TMP4]
/* Vertical blend for pixel 3 into q11. */
3293 vmull.u8 q11, d16, d28
3294 vmlal.u8 q11, d17, d29
/* Start horizontal blend for pixel 1 (finished in the tail). */
3296 vshll.u16 q1, d18, #8
3297 vmlsl.u16 q1, d18, d31
/*
 * Pipeline "tail": finish the four 8888 pixels started by the head.
 * Completes the horizontal blends into q1..q3, advances the fixed-point
 * x accumulator (q12 += q13, twice - two 2-pixel steps) while refreshing
 * the weights in q15, narrows the 32-bit results and stores the four
 * destination pixels with 128-bit alignment.
 * NOTE(review): the final 16->8 bit packing into d6/d7 happens on lines
 * not visible in this excerpt - confirm.
 */
3300 .macro bilinear_interpolate_four_pixels_8888_8888_tail
/* Finish horizontal blend of pixel 1. */
3301 vmlal.u16 q1, d19, d31
3302 vshr.u16 q15, q12, #8
/* Horizontal blend of pixel 2: q2 = d20*(256-d30) + d21*d30. */
3303 vshll.u16 q2, d20, #8
3304 vmlsl.u16 q2, d20, d30
3305 vmlal.u16 q2, d21, d30
/* Horizontal blend of pixel 3 with weights d31. */
3306 vshll.u16 q3, d22, #8
3307 vmlsl.u16 q3, d22, d31
3308 vmlal.u16 q3, d23, d31
/* Advance fractional x and extract the next weights. */
3309 vadd.u16 q12, q12, q13
/* Narrow the 16.16 products back to 16 bits per channel. */
3310 vshrn.u32 d0, q0, #16
3311 vshrn.u32 d1, q1, #16
3312 vshrn.u32 d4, q2, #16
3313 vshr.u16 q15, q12, #8
3314 vshrn.u32 d5, q3, #16
3317 vadd.u16 q12, q12, q13
/* Store four a8r8g8b8 pixels (16 bytes), OUT is 128-bit aligned. */
3318 vst1.32 {d6, d7}, [OUT, :128]!
/*
 * Steady-state step: interleaves the tail of the previous four pixels
 * (horizontal blends, narrowing, store) with the head of the next four
 * (address generation, loads, vertical blends).  The interleaving hides
 * NEON latencies; do not reorder these instructions.
 */
3321 .macro bilinear_interpolate_four_pixels_8888_8888_tail_head
/* Next iteration: TMP1/TMP2 = &top_row[X >> 16] for pixels 0 and 1. */
3322 mov TMP1, X, asr #16
3324 add TMP1, TOP, TMP1, asl #2
3325 mov TMP2, X, asr #16
3327 add TMP2, TOP, TMP2, asl #2
/* Previous iteration: finish horizontal blends of pixels 1..3. */
3328 vmlal.u16 q1, d19, d31
3329 vshr.u16 q15, q12, #8
3330 vshll.u16 q2, d20, #8
3331 vmlsl.u16 q2, d20, d30
3332 vmlal.u16 q2, d21, d30
3333 vshll.u16 q3, d22, #8
3334 vld1.32 {d20}, [TMP1], STRIDE
3335 vmlsl.u16 q3, d22, d31
3336 vmlal.u16 q3, d23, d31
3337 vld1.32 {d21}, [TMP1]
/* Next iteration: vertical blend of pixel 0 into q8. */
3338 vmull.u8 q8, d20, d28
3339 vmlal.u8 q8, d21, d29
/* Previous iteration: narrow results to 16 bits per channel. */
3340 vshrn.u32 d0, q0, #16
3341 vshrn.u32 d1, q1, #16
3342 vshrn.u32 d4, q2, #16
3343 vld1.32 {d22}, [TMP2], STRIDE
3344 vshrn.u32 d5, q3, #16
/* Advance fractional x accumulator. */
3345 vadd.u16 q12, q12, q13
3346 vld1.32 {d23}, [TMP2]
/* Next iteration: vertical blend of pixel 1 into q9. */
3347 vmull.u8 q9, d22, d28
3348 mov TMP3, X, asr #16
3350 add TMP3, TOP, TMP3, asl #2
3351 mov TMP4, X, asr #16
3353 add TMP4, TOP, TMP4, asl #2
3354 vmlal.u8 q9, d23, d29
3355 vld1.32 {d22}, [TMP3], STRIDE
3356 vshr.u16 q15, q12, #8
3357 vld1.32 {d23}, [TMP3]
/* Next iteration: vertical blend of pixel 2 into q10. */
3358 vmull.u8 q10, d22, d28
3359 vmlal.u8 q10, d23, d29
/* Next iteration: start horizontal blend of pixel 0. */
3361 vshll.u16 q0, d16, #8
3363 vmlsl.u16 q0, d16, d30
3364 vmlal.u16 q0, d17, d30
3366 vld1.32 {d16}, [TMP4], STRIDE
3367 vadd.u16 q12, q12, q13
3368 vld1.32 {d17}, [TMP4]
/* Next iteration: vertical blend of pixel 3 into q11. */
3370 vmull.u8 q11, d16, d28
3371 vmlal.u8 q11, d17, d29
/* Previous iteration: store four finished a8r8g8b8 pixels. */
3372 vst1.32 {d6, d7}, [OUT, :128]!
/* Next iteration: start horizontal blend of pixel 1. */
3373 vshll.u16 q1, d18, #8
3374 vmlsl.u16 q1, d18, d31
3377 /*****************************************************************************/
/* Register this specialized a8r8g8b8 -> r5g6b5 eight-pixel variant so
 * the generic dispatch macros pick it instead of the common code. */
3379 .set have_bilinear_interpolate_eight_pixels_8888_0565, 1
/*
 * Pipeline "head" for eight 8888 source pixels destined for a 0565
 * buffer.  Structurally this is a four-pixel head (loads + vertical
 * blends into q8..q11, start of horizontal blends in q0/q1) followed
 * immediately by the first half of a tail_head step which begins the
 * second group of four pixels.  Weights: d28/d29 vertical, d30/d31
 * horizontal; q12/q13 fractional-x accumulator and increment.
 */
3381 .macro bilinear_interpolate_eight_pixels_8888_0565_head
/* First group of four: addresses TOP + (X >> 16) * 4. */
3382 mov TMP1, X, asr #16
3384 add TMP1, TOP, TMP1, asl #2
3385 mov TMP2, X, asr #16
3387 add TMP2, TOP, TMP2, asl #2
3388 vld1.32 {d20}, [TMP1], STRIDE
3389 vld1.32 {d21}, [TMP1]
/* Vertical blend of pixel 0: q8 = top * d28 + bottom * d29. */
3390 vmull.u8 q8, d20, d28
3391 vmlal.u8 q8, d21, d29
3392 vld1.32 {d22}, [TMP2], STRIDE
3393 vld1.32 {d23}, [TMP2]
3394 vmull.u8 q9, d22, d28
3395 mov TMP3, X, asr #16
3397 add TMP3, TOP, TMP3, asl #2
3398 mov TMP4, X, asr #16
3400 add TMP4, TOP, TMP4, asl #2
3401 vmlal.u8 q9, d23, d29
3402 vld1.32 {d22}, [TMP3], STRIDE
3403 vld1.32 {d23}, [TMP3]
3404 vmull.u8 q10, d22, d28
3405 vmlal.u8 q10, d23, d29
/* Start horizontal blend of pixel 0: q0 = d16*(256-d30) + d17*d30. */
3406 vshll.u16 q0, d16, #8
3407 vmlsl.u16 q0, d16, d30
3408 vmlal.u16 q0, d17, d30
3410 vld1.32 {d16}, [TMP4], STRIDE
3411 vld1.32 {d17}, [TMP4]
3413 vmull.u8 q11, d16, d28
3414 vmlal.u8 q11, d17, d29
3415 vshll.u16 q1, d18, #8
3416 vmlsl.u16 q1, d18, d31
/* Second group of four: interleave finishing pixels 0-3 with loads
 * and vertical blends for pixels 4-7 (same pattern as tail_head). */
3418 mov TMP1, X, asr #16
3420 add TMP1, TOP, TMP1, asl #2
3421 mov TMP2, X, asr #16
3423 add TMP2, TOP, TMP2, asl #2
3424 vmlal.u16 q1, d19, d31
3425 vshr.u16 q15, q12, #8
3426 vshll.u16 q2, d20, #8
3427 vmlsl.u16 q2, d20, d30
3428 vmlal.u16 q2, d21, d30
3429 vshll.u16 q3, d22, #8
3430 vld1.32 {d20}, [TMP1], STRIDE
3431 vmlsl.u16 q3, d22, d31
3432 vmlal.u16 q3, d23, d31
3433 vld1.32 {d21}, [TMP1]
3434 vmull.u8 q8, d20, d28
3435 vmlal.u8 q8, d21, d29
/* Narrow the finished blends of pixels 0-3 to 16 bits per channel. */
3436 vshrn.u32 d0, q0, #16
3437 vshrn.u32 d1, q1, #16
3438 vshrn.u32 d4, q2, #16
3439 vld1.32 {d22}, [TMP2], STRIDE
3440 vshrn.u32 d5, q3, #16
/* Advance fractional x accumulator. */
3441 vadd.u16 q12, q12, q13
3442 vld1.32 {d23}, [TMP2]
3443 vmull.u8 q9, d22, d28
3444 mov TMP3, X, asr #16
3446 add TMP3, TOP, TMP3, asl #2
3447 mov TMP4, X, asr #16
3449 add TMP4, TOP, TMP4, asl #2
3450 vmlal.u8 q9, d23, d29
3451 vld1.32 {d22}, [TMP3], STRIDE
3452 vshr.u16 q15, q12, #8
3453 vld1.32 {d23}, [TMP3]
3454 vmull.u8 q10, d22, d28
3455 vmlal.u8 q10, d23, d29
3457 vshll.u16 q0, d16, #8
3459 vmlsl.u16 q0, d16, d30
3460 vmlal.u16 q0, d17, d30
3462 vld1.32 {d16}, [TMP4], STRIDE
3463 vadd.u16 q12, q12, q13
3464 vld1.32 {d17}, [TMP4]
3466 vmull.u8 q11, d16, d28
3467 vmlal.u8 q11, d17, d29
3468 vshll.u16 q1, d18, #8
3469 vmlsl.u16 q1, d18, d31
/*
 * Pipeline "tail" for the eight-pixel 8888 -> 0565 path: finish the
 * horizontal blends of the last four pixels, narrow everything, pack
 * the ARGB results down to r5g6b5 and store all eight 565 pixels.
 * NOTE(review): several packing instructions between the narrowing and
 * the vsri are not visible in this excerpt - confirm against the file.
 */
3472 .macro bilinear_interpolate_eight_pixels_8888_0565_tail
/* Finish horizontal blends of pixels 4-7 into q1..q3. */
3473 vmlal.u16 q1, d19, d31
3474 vshr.u16 q15, q12, #8
3475 vshll.u16 q2, d20, #8
3476 vmlsl.u16 q2, d20, d30
3477 vmlal.u16 q2, d21, d30
3478 vshll.u16 q3, d22, #8
3479 vmlsl.u16 q3, d22, d31
3480 vmlal.u16 q3, d23, d31
/* Advance fractional x and narrow results to 16 bits per channel. */
3481 vadd.u16 q12, q12, q13
3482 vshrn.u32 d0, q0, #16
3483 vshrn.u32 d1, q1, #16
3484 vshrn.u32 d4, q2, #16
3485 vshr.u16 q15, q12, #8
3486 vshrn.u32 d5, q3, #16
3489 vadd.u16 q12, q12, q13
/* Pack to r5g6b5: red into the top bits, then shift-insert the
 * remaining components with vsri. */
3496 vshll.u8 q5, d10, #8
3499 vsri.u16 q5, q7, #11
/* Store eight r5g6b5 pixels (16 bytes), OUT is 128-bit aligned. */
3500 vst1.32 {d10, d11}, [OUT, :128]!
/*
 * Steady-state step for the eight-pixel 8888 -> 0565 path: finishes the
 * previous eight pixels (horizontal blends, narrowing, 565 packing and
 * store) while starting the next eight (loads and vertical blends).
 * The instruction interleaving hides NEON latencies; do not reorder.
 */
3503 .macro bilinear_interpolate_eight_pixels_8888_0565_tail_head
/* Next group, pixels 0/1: addresses TOP + (X >> 16) * 4. */
3504 mov TMP1, X, asr #16
3506 add TMP1, TOP, TMP1, asl #2
3507 mov TMP2, X, asr #16
3509 add TMP2, TOP, TMP2, asl #2
/* Previous group: finish horizontal blends of pixels 4-7. */
3510 vmlal.u16 q1, d19, d31
3511 vshr.u16 q15, q12, #8
3513 vshll.u16 q2, d20, #8
3514 vmlsl.u16 q2, d20, d30
3515 vmlal.u16 q2, d21, d30
3516 vshll.u16 q3, d22, #8
3517 vld1.32 {d20}, [TMP1], STRIDE
3518 vmlsl.u16 q3, d22, d31
3519 vmlal.u16 q3, d23, d31
3520 vld1.32 {d21}, [TMP1]
/* Next group: vertical blend of pixel 0 into q8. */
3521 vmull.u8 q8, d20, d28
3522 vmlal.u8 q8, d21, d29
3523 vshrn.u32 d0, q0, #16
3524 vshrn.u32 d1, q1, #16
3525 vshrn.u32 d4, q2, #16
3526 vld1.32 {d22}, [TMP2], STRIDE
3527 vshrn.u32 d5, q3, #16
3528 vadd.u16 q12, q12, q13
3529 vld1.32 {d23}, [TMP2]
3530 vmull.u8 q9, d22, d28
3531 mov TMP3, X, asr #16
3533 add TMP3, TOP, TMP3, asl #2
3534 mov TMP4, X, asr #16
3536 add TMP4, TOP, TMP4, asl #2
3537 vmlal.u8 q9, d23, d29
3538 vld1.32 {d22}, [TMP3], STRIDE
3539 vshr.u16 q15, q12, #8
3540 vld1.32 {d23}, [TMP3]
3541 vmull.u8 q10, d22, d28
3542 vmlal.u8 q10, d23, d29
/* Next group: start horizontal blend of pixel 0. */
3544 vshll.u16 q0, d16, #8
3546 vmlsl.u16 q0, d16, d30
3547 vmlal.u16 q0, d17, d30
3549 vld1.32 {d16}, [TMP4], STRIDE
3550 vadd.u16 q12, q12, q13
3551 vld1.32 {d17}, [TMP4]
3553 vmull.u8 q11, d16, d28
3554 vmlal.u8 q11, d17, d29
3556 vshll.u16 q1, d18, #8
3557 vmlsl.u16 q1, d18, d31
/* Next group, pixels 4/5 addresses; continue as in the head macro. */
3559 mov TMP1, X, asr #16
3561 add TMP1, TOP, TMP1, asl #2
3562 mov TMP2, X, asr #16
3564 add TMP2, TOP, TMP2, asl #2
3565 vmlal.u16 q1, d19, d31
3567 vshr.u16 q15, q12, #8
3568 vshll.u16 q2, d20, #8
3570 vmlsl.u16 q2, d20, d30
3571 vmlal.u16 q2, d21, d30
3572 vshll.u16 q3, d22, #8
3573 vld1.32 {d20}, [TMP1], STRIDE
3574 vmlsl.u16 q3, d22, d31
3575 vmlal.u16 q3, d23, d31
3576 vld1.32 {d21}, [TMP1]
3577 vmull.u8 q8, d20, d28
3578 vmlal.u8 q8, d21, d29
/* Previous group: pack finished pixels to r5g6b5 (see tail macro). */
3580 vshll.u8 q5, d10, #8
3582 vshrn.u32 d0, q0, #16
3584 vshrn.u32 d1, q1, #16
3585 vsri.u16 q5, q7, #11
3586 vshrn.u32 d4, q2, #16
3587 vld1.32 {d22}, [TMP2], STRIDE
3588 vshrn.u32 d5, q3, #16
3589 vadd.u16 q12, q12, q13
3590 vld1.32 {d23}, [TMP2]
3591 vmull.u8 q9, d22, d28
3592 mov TMP3, X, asr #16
3594 add TMP3, TOP, TMP3, asl #2
3595 mov TMP4, X, asr #16
3597 add TMP4, TOP, TMP4, asl #2
3598 vmlal.u8 q9, d23, d29
3599 vld1.32 {d22}, [TMP3], STRIDE
3600 vshr.u16 q15, q12, #8
3601 vld1.32 {d23}, [TMP3]
3602 vmull.u8 q10, d22, d28
3603 vmlal.u8 q10, d23, d29
3605 vshll.u16 q0, d16, #8
3607 vmlsl.u16 q0, d16, d30
3608 vmlal.u16 q0, d17, d30
3610 vld1.32 {d16}, [TMP4], STRIDE
3611 vadd.u16 q12, q12, q13
3612 vld1.32 {d17}, [TMP4]
3614 vmull.u8 q11, d16, d28
3615 vmlal.u8 q11, d17, d29
3616 vshll.u16 q1, d18, #8
/* Previous group: store eight r5g6b5 pixels (16 bytes), aligned. */
3617 vst1.32 {d10, d11}, [OUT, :128]!
3618 vmlsl.u16 q1, d18, d31
3620 /*****************************************************************************/
/*
 * Instantiate the bilinear scanline scalers.  Argument order:
 *   fname, src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift,
 *   prefetch_distance, flags
 * (shift 2 == 4-byte x8r8g8b8 pixels, shift 1 == 2-byte r5g6b5 pixels;
 * prefetch distance is 28 destination pixels ahead in the source.)
 * Do not insert anything between the '\'-continued lines.
 */
/* a8r8g8b8 -> a8r8g8b8, 4-pixel unrolled main loop. */
3622 generate_bilinear_scanline_func \
3623 pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
3624 2, 2, 28, BILINEAR_FLAG_UNROLL_4
/* a8r8g8b8 -> r5g6b5, 8-pixel unrolled, uses the full NEON file. */
3626 generate_bilinear_scanline_func \
3627 pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
3628 2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS
/* r5g6b5 -> x8r8g8b8, 4-pixel unrolled. */
3630 generate_bilinear_scanline_func \
3631 pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
3632 1, 2, 28, BILINEAR_FLAG_UNROLL_4
/* r5g6b5 -> r5g6b5, 4-pixel unrolled. */
3634 generate_bilinear_scanline_func \
3635 pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
3636 1, 1, 28, BILINEAR_FLAG_UNROLL_4