2 * Copyright © 2009 Nokia Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
23 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
27 * This file contains implementations of NEON optimized pixel processing
28 * functions. There is no full and detailed tutorial, but some functions
29 * (those which are exposing some new or interesting features) are
30 * extensively commented and can be used as examples.
32 * You may want to have a look at the comments for following functions:
33 * - pixman_composite_over_8888_0565_asm_neon
34 * - pixman_composite_over_n_8_0565_asm_neon
37 /* Prevent the stack from becoming executable for no reason... */
/*
 * NOTE(review): each line of this listing carries its original source line
 * number fused in front of the code; jumps in that numbering indicate lines
 * elided from this view (e.g. the #endif matching the #if below, and the
 * opening delimiters of some comment blocks). Confirm against the full file
 * before assembling.
 */
38 #if defined(__linux__) && defined(__ELF__)
39 .section .note.GNU-stack,"",%progbits
/* Keep the object's build attributes neutral w.r.t. FP/SIMD architecture */
46 .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
47 .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
/* Template macros (generate_composite_function, PF, cache_preload, ...) */
51 #include "pixman-arm-neon-asm.h"
53 /* Global configuration options and preferences */
56 * The code can optionally make use of unaligned memory accesses to improve
57 * performance of handling leading/trailing pixels for each scanline.
58 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
59 * example in linux if unaligned memory accesses are not configured to
60 * generate exceptions.
62 .set RESPECT_STRICT_ALIGNMENT, 1
65 * Set default prefetch type. There is a choice between the following options:
67 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
68 * as NOP to workaround some HW bugs or for whatever other reason)
70 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
71 * advanced prefetch introduces heavy overhead)
73 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
74 * which can run ARM and NEON instructions simultaneously so that extra ARM
75 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
77 * Note: some types of function can't support advanced prefetch and fallback
78 * to simple one (those which handle 24bpp pixels)
80 .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
82 /* Prefetch distance in pixels for simple prefetch */
83 .set PREFETCH_DISTANCE_SIMPLE, 64
86 * Implementation of pixman_composite_over_8888_0565_asm_neon
88 * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
89 * performs OVER compositing operation. Function fast_composite_over_8888_0565
90 * from pixman-fast-path.c does the same in C and can be used as a reference.
92 * First we need to have some NEON assembly code which can do the actual
93 * operation on the pixels and provide it to the template macro.
95 * Template macro quite conveniently takes care of emitting all the necessary
96 * code for memory reading and writing (including quite tricky cases of
97 * handling unaligned leading/trailing pixels), so we only need to deal with
98 * the data in NEON registers.
100 * NEON registers allocation in general is recommended to be the following:
101 * d0, d1, d2, d3 - contain loaded source pixel data
102 * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed)
103 * d24, d25, d26, d27 - contain loaded mask pixel data (if mask is used)
104 * d28, d29, d30, d31 - place for storing the result (destination pixels)
106 * As can be seen above, four 64-bit NEON registers are used for keeping
107 * intermediate pixel data and up to 8 pixels can be processed in one step
108 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
110 * This particular function uses the following registers allocation:
111 * d0, d1, d2, d3 - contain loaded source pixel data
112 * d4, d5 - contain loaded destination pixels (they are needed)
113 * d28, d29 - place for storing the result (destination pixels)
117 * Step one. We need to have some code to do some arithmetics on pixel data.
118 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
119 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
120 * perform all the needed calculations and write the result to {d28, d29}.
121 * The rationale for having two macros and not just one will be explained
122 * later. In practice, any single monolithic function which does the work can
123 * be split into two parts in any arbitrary way without affecting correctness.
125 * There is one special trick here too. Common template macro can optionally
126 * make our life a bit easier by doing R, G, B, A color components
127 * deinterleaving for 32bpp pixel formats (and this feature is used in
128 * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
129 * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
130 * actually use d0 register for blue channel (a vector of eight 8-bit
131 * values), d1 register for green, d2 for red and d3 for alpha. This
132 * simple conversion can be also done with a few NEON instructions:
134 * Packed to planar conversion:
140 * Planar to packed conversion:
146 * But pixel can be loaded directly in planar format using VLD4.8 NEON
147 * instruction. It is 1 cycle slower than VLD1.32, so this is not always
148 * desirable, that's why deinterleaving is optional.
150 * But anyway, here is the code:
/*
 * 'head' half of the per-block arithmetic for over_8888_0565: starts the
 * OVER blend of 8 a8r8g8b8 source pixels (planar, d0-d3: d0=B d1=G d2=R
 * d3=A per the register-allocation comment above) onto 8 r5g6b5 destination
 * pixels loaded in {d4, d5}.
 * NOTE(review): jumps in the fused original line numbers show that several
 * instructions and the closing .endm are elided from this listing; the body
 * below is incomplete as shown.
 */
152 .macro pixman_composite_over_8888_0565_process_pixblock_head
153 /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
154 and put data into d6 - red, d7 - green, d30 - blue */
159 vmvn.8 d3, d3 /* invert source alpha */
/* narrow to the 8-bit blue component (set-up of q2 is in elided lines) */
161 vshrn.u16 d30, q2, #2
162 /* now do alpha blending, storing results in 8-bit planar format
163 into d16 - red, d19 - green, d18 - blue */
/* q12 = inverted_src_alpha * dst_blue (16-bit products) */
166 vmull.u8 q12, d3, d30
/* rounding division by 255: first t = (x + 128) >> 8 ... */
167 vrshr.u16 q13, q10, #8
168 vrshr.u16 q3, q11, #8
169 vrshr.u16 q15, q12, #8
/* ... then result = (x + t + 128) >> 8, narrowed back to 8 bits */
170 vraddhn.u16 d20, q10, q13
171 vraddhn.u16 d23, q11, q3
172 vraddhn.u16 d22, q12, q15
/*
 * 'tail' half: finishes the blend started by the head and packs the result
 * back to r5g6b5 in {d28, d29} (the store registers, see comments above).
 */
175 .macro pixman_composite_over_8888_0565_process_pixblock_tail
176 /* ... continue alpha blending */
/* saturating add of the source component to the blended destination term */
177 vqadd.u8 d16, d2, d20
179 /* convert the result to r5g6b5 and store it into {d28, d29} */
/* widen d16 so the component lands in the top bits of each 16-bit lane */
180 vshll.u8 q14, d16, #8
/* shift-right-insert the next component field (q9, from elided lines) */
184 vsri.u16 q14, q9, #11
188 * OK, now we got almost everything that we need. Using the above two
189 * macros, the work can be done right. But now we want to optimize
190 * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
191 * a lot from good code scheduling and software pipelining.
193 * Let's construct some code, which will run in the core main loop.
194 * Some pseudo-code of the main loop will look like this:
202 * It may look a bit weird, but this setup allows to hide instruction
203 * latencies better and also utilize dual-issue capability more
204 * efficiently (make pairs of load-store and ALU instructions).
206 * So what we need now is a '*_tail_head' macro, which will be used
207 * in the core main loop. A trivial straightforward implementation
208 * of this macro would look like this:
210 * pixman_composite_over_8888_0565_process_pixblock_tail
211 * vst1.16 {d28, d29}, [DST_W, :128]!
212 * vld1.16 {d4, d5}, [DST_R, :128]!
213 * vld4.32 {d0, d1, d2, d3}, [SRC]!
214 * pixman_composite_over_8888_0565_process_pixblock_head
217 * Now it also got some VLD/VST instructions. We simply can't move from
218 * processing one block of pixels to the other one with just arithmetics.
219 * The previously processed data needs to be written to memory and new
220 * data needs to be fetched. Fortunately, this main loop does not deal
221 * with partial leading/trailing pixels and can load/store a full block
222 * of pixels in a bulk. Additionally, destination buffer is already
223 * 16 bytes aligned here (which is good for performance).
225 * New things here are DST_R, DST_W, SRC and MASK identifiers. These
226 * are the aliases for ARM registers which are used as pointers for
227 * accessing data. We maintain separate pointers for reading and writing
228 * destination buffer (DST_R and DST_W).
230 * Another new thing is 'cache_preload' macro. It is used for prefetching
231 * data into CPU L2 cache and improving performance when dealing with large
232 * images which are far larger than cache size. It uses one argument
233 * (actually two, but they need to be the same here) - number of pixels
234 * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
235 * details about this macro. Moreover, if good performance is needed
236 * the code from this macro needs to be copied into '*_tail_head' macro
237 * and mixed with the rest of code for optimal instructions scheduling.
238 * We are actually doing it below.
240 * Now after all the explanations, here is the optimized code.
241 * Different instruction streams (originating from '*_head', '*_tail'
242 * and 'cache_preload' macro) use different indentation levels for
243 * better readability. Actually taking the code from one of these
244 * indentation levels and ignoring a few VLD/VST instructions would
245 * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
/*
 * Software-pipelined 'tail_head' used in the main loop: finishes the
 * previous pixel block ('tail' stream), stores it, loads the next block and
 * starts its computation ('head' stream), with PF-prefixed advanced-
 * prefetch bookkeeping interleaved for dual-issue (see the explanation in
 * the comment blocks above and 'pixman-arm-neon-asm.h').
 * NOTE(review): interior instructions are elided from this listing (the
 * fused original line numbers jump), including the source load and .endm.
 */
251 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head
252 vqadd.u8 d16, d2, d20
253 vld1.16 {d4, d5}, [DST_R, :128]!
259 vshll.u8 q14, d16, #8
/* PF lines: advance prefetch X position / per-scanline control counter */
260 PF add PF_X, PF_X, #8
264 PF addne PF_X, PF_X, #8
266 PF subne PF_CTL, PF_CTL, #1
268 vshrn.u16 d30, q2, #2
270 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
272 vmull.u8 q12, d3, d30
273 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
277 vrshr.u16 q13, q10, #8
278 PF subge PF_X, PF_X, ORIG_W
279 vrshr.u16 q3, q11, #8
280 vrshr.u16 q15, q12, #8
281 PF subges PF_CTL, PF_CTL, #0x10
282 vsri.u16 q14, q9, #11
/* writeback-form loads advance the prefetch pointers to the next scanline */
283 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
284 vraddhn.u16 d20, q10, q13
285 vraddhn.u16 d23, q11, q3
286 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
287 vraddhn.u16 d22, q12, q15
/* store the completed 8 r5g6b5 pixels of the previous block */
288 vst1.16 {d28, d29}, [DST_W, :128]!
293 /* If we did not care much about the performance, we would just use this... */
/*
 * Unscheduled illustrative variant: plain tail -> store -> load -> head.
 * NOTE(review): this re-defines the macro already defined above; in the
 * upstream file such a duplicate is disabled by a preprocessor guard
 * (#if 0 / #endif) which is not visible in this listing -- confirm before
 * assembling.
 */
294 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head
295 pixman_composite_over_8888_0565_process_pixblock_tail
296 vst1.16 {d28, d29}, [DST_W, :128]!
297 vld1.16 {d4, d5}, [DST_R, :128]!
299 pixman_composite_over_8888_0565_process_pixblock_head
306 * And now the final part. We are using 'generate_composite_function' macro
307 * to put all the stuff together. We are specifying the name of the function
308 * which we want to get, number of bits per pixel for the source, mask and
309 * destination (0 if unused, like mask in this case). Next come some bit
311 * FLAG_DST_READWRITE - tells that the destination buffer is both read
312 * and written, for write-only buffer we would use
313 * FLAG_DST_WRITEONLY flag instead
314 * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
315 * and separate color channels for 32bpp format.
316 * The next things are:
317 * - the number of pixels processed per iteration (8 in this case, because
318 * that's the maximum what can fit into four 64-bit NEON registers).
319 * - prefetch distance, measured in pixel blocks. In this case it is 5 times
320 * by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
321 * prefetch distance can be selected by running some benchmarks.
323 * After that we specify some macros, these are 'default_init',
324 * 'default_cleanup' here which are empty (but it is possible to have custom
325 * init/cleanup macros to be able to save/restore some extra NEON registers
326 * like d8-d15 or do anything else) followed by
327 * 'pixman_composite_over_8888_0565_process_pixblock_head',
328 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
329 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
330 * which we got implemented above.
332 * The last part is the NEON registers allocation scheme.
/*
 * Emit pixman_composite_over_8888_0565_asm_neon: 32bpp source, no mask,
 * 16bpp destination, OVER operator. The arguments are explained in detail
 * in the comment block above. NOTE(review): the default_init /
 * default_cleanup argument lines are elided from this listing.
 */
334 generate_composite_function \
335 pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
336 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
337 8, /* number of pixels, processed in a single block */ \
338 5, /* prefetch distance */ \
341 pixman_composite_over_8888_0565_process_pixblock_head, \
342 pixman_composite_over_8888_0565_process_pixblock_tail, \
343 pixman_composite_over_8888_0565_process_pixblock_tail_head, \
344 28, /* dst_w_basereg */ \
345 4, /* dst_r_basereg */ \
346 0, /* src_basereg */ \
347 24 /* mask_basereg */
349 /******************************************************************************/
/*
 * OVER of a solid (constant) source onto r5g6b5 destination. Same blend
 * arithmetic as over_8888_0565 above, except the source color is constant:
 * d3 already holds the inverted source alpha, prepared once by the init
 * macro below, so no per-block vmvn is needed.
 * NOTE(review): interior lines and .endm directives are elided from this
 * listing (the fused original line numbers jump).
 */
351 .macro pixman_composite_over_n_0565_process_pixblock_head
352 /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
353 and put data into d6 - red, d7 - green, d30 - blue */
359 vshrn.u16 d30, q2, #2
360 /* now do alpha blending, storing results in 8-bit planar format
361 into d16 - red, d19 - green, d18 - blue */
/* q12 = inverted_src_alpha * dst_blue */
364 vmull.u8 q12, d3, d30
/* rounding x/255: t = (x + 128) >> 8; result = (x + t + 128) >> 8 */
365 vrshr.u16 q13, q10, #8
366 vrshr.u16 q3, q11, #8
367 vrshr.u16 q15, q12, #8
368 vraddhn.u16 d20, q10, q13
369 vraddhn.u16 d23, q11, q3
370 vraddhn.u16 d22, q12, q15
/* finish the blend and repack to r5g6b5 in {d28, d29} */
373 .macro pixman_composite_over_n_0565_process_pixblock_tail
374 /* ... continue alpha blending */
375 vqadd.u8 d16, d2, d20
377 /* convert the result to r5g6b5 and store it into {d28, d29} */
378 vshll.u8 q14, d16, #8
382 vsri.u16 q14, q9, #11
385 /* TODO: expand macros and do better instructions scheduling */
/* unscheduled main-loop body: tail, then load/store, then head */
386 .macro pixman_composite_over_n_0565_process_pixblock_tail_head
387 pixman_composite_over_n_0565_process_pixblock_tail
388 vld1.16 {d4, d5}, [DST_R, :128]!
389 vst1.16 {d28, d29}, [DST_W, :128]!
390 pixman_composite_over_n_0565_process_pixblock_head
/*
 * One-time setup: fetch the solid source color from the stack (at
 * ARGS_STACK_OFFSET) and invert its alpha up front. The replication of the
 * color components into the working registers appears elided from this
 * listing -- TODO confirm against the full file.
 */
394 .macro pixman_composite_over_n_0565_init
395 add DUMMY, sp, #ARGS_STACK_OFFSET
396 vld1.32 {d3[0]}, [DUMMY]
401 vmvn.8 d3, d3 /* invert source alpha */
/*
 * Emit pixman_composite_over_n_0565_asm_neon: solid source (src bpp 0),
 * no mask, 16bpp destination. NOTE(review): the cleanup-macro argument
 * line is elided from this listing.
 */
404 generate_composite_function \
405 pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
406 FLAG_DST_READWRITE, \
407 8, /* number of pixels, processed in a single block */ \
408 5, /* prefetch distance */ \
409 pixman_composite_over_n_0565_init, \
411 pixman_composite_over_n_0565_process_pixblock_head, \
412 pixman_composite_over_n_0565_process_pixblock_tail, \
413 pixman_composite_over_n_0565_process_pixblock_tail_head, \
414 28, /* dst_w_basereg */ \
415 4, /* dst_r_basereg */ \
416 0, /* src_basereg */ \
417 24 /* mask_basereg */
419 /******************************************************************************/
/*
 * Plain SRC conversion a8r8g8b8 -> r5g6b5 (no blending; destination is
 * write-only). NOTE(review): the head macro's body (the instructions that
 * start packing 8888 into 565) is entirely elided from this listing, as
 * are the .endm directives.
 */
421 .macro pixman_composite_src_8888_0565_process_pixblock_head
/* tail: finish packing -- insert the remaining component field into q14 */
427 .macro pixman_composite_src_8888_0565_process_pixblock_tail
429 vsri.u16 q14, q9, #11
/* pipelined main-loop body with PF advanced-prefetch stream interleaved;
   only the source buffer is prefetched (destination is write-only) */
432 .macro pixman_composite_src_8888_0565_process_pixblock_tail_head
434 PF add PF_X, PF_X, #8
437 PF addne PF_X, PF_X, #8
438 PF subne PF_CTL, PF_CTL, #1
439 vsri.u16 q14, q9, #11
441 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
443 vst1.16 {d28, d29}, [DST_W, :128]!
444 PF subge PF_X, PF_X, ORIG_W
445 PF subges PF_CTL, PF_CTL, #0x10
447 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
/* emit pixman_composite_src_8888_0565_asm_neon (32bpp -> 16bpp copy) */
451 generate_composite_function \
452 pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
453 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
454 8, /* number of pixels, processed in a single block */ \
455 10, /* prefetch distance */ \
458 pixman_composite_src_8888_0565_process_pixblock_head, \
459 pixman_composite_src_8888_0565_process_pixblock_tail, \
460 pixman_composite_src_8888_0565_process_pixblock_tail_head
462 /******************************************************************************/
/*
 * Plain SRC conversion r5g6b5 -> a8r8g8b8. The head splits each 16-bit
 * pixel in q0 into 8-bit component vectors d28-d30 by narrowing shifts;
 * the bit-replication/alpha steps appear elided from this listing -- TODO
 * confirm against the full file. .endm directives are also elided.
 */
464 .macro pixman_composite_src_0565_8888_process_pixblock_head
465 vshrn.u16 d30, q0, #8
466 vshrn.u16 d29, q0, #3
471 vshrn.u16 d28, q0, #2
/* tail: nothing left to do (all work happens in the head) */
474 .macro pixman_composite_src_0565_8888_process_pixblock_tail
477 /* TODO: expand macros and do better instructions scheduling */
/* unscheduled main-loop body; the vld of the next source block is elided
   from this listing */
478 .macro pixman_composite_src_0565_8888_process_pixblock_tail_head
479 pixman_composite_src_0565_8888_process_pixblock_tail
480 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
482 pixman_composite_src_0565_8888_process_pixblock_head
/* emit pixman_composite_src_0565_8888_asm_neon (16bpp -> 32bpp copy) */
486 generate_composite_function \
487 pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
488 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
489 8, /* number of pixels, processed in a single block */ \
490 10, /* prefetch distance */ \
493 pixman_composite_src_0565_8888_process_pixblock_head, \
494 pixman_composite_src_0565_8888_process_pixblock_tail, \
495 pixman_composite_src_0565_8888_process_pixblock_tail_head
497 /******************************************************************************/
/*
 * ADD operator on 8bpp buffers, 32 pixels per block (8bpp allows four
 * pixels per register byte-lane-wise, see the comment near the top of the
 * file). NOTE(review): the head body (presumably the saturating add) and
 * the tail body are elided from this listing, as are the .endm directives.
 */
499 .macro pixman_composite_add_8_8_process_pixblock_head
504 .macro pixman_composite_add_8_8_process_pixblock_tail
/* pipelined main-loop body: bulk load/store plus PF prefetch bookkeeping
   for both source and destination (destination is read-modify-write) */
507 .macro pixman_composite_add_8_8_process_pixblock_tail_head
509 PF add PF_X, PF_X, #32
511 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
512 PF addne PF_X, PF_X, #32
513 PF subne PF_CTL, PF_CTL, #1
514 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
516 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
517 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
518 PF subge PF_X, PF_X, ORIG_W
519 PF subges PF_CTL, PF_CTL, #0x10
521 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
522 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
/* emit pixman_composite_add_8_8_asm_neon (8bpp + 8bpp saturating add) */
526 generate_composite_function \
527 pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
528 FLAG_DST_READWRITE, \
529 32, /* number of pixels, processed in a single block */ \
530 10, /* prefetch distance */ \
533 pixman_composite_add_8_8_process_pixblock_head, \
534 pixman_composite_add_8_8_process_pixblock_tail, \
535 pixman_composite_add_8_8_process_pixblock_tail_head
537 /******************************************************************************/
/*
 * ADD operator on 32bpp buffers: same per-byte arithmetic as add_8_8 (the
 * generate calls below reuse the add_8_8 head/tail macros), but 8 pixels
 * per block and 32-bit element loads/stores.
 * NOTE(review): interior lines and .endm are elided from this listing.
 */
539 .macro pixman_composite_add_8888_8888_process_pixblock_tail_head
541 PF add PF_X, PF_X, #8
543 vld1.32 {d4, d5, d6, d7}, [DST_R, :128]!
544 PF addne PF_X, PF_X, #8
545 PF subne PF_CTL, PF_CTL, #1
546 vst1.32 {d28, d29, d30, d31}, [DST_W, :128]!
548 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
549 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
550 PF subge PF_X, PF_X, ORIG_W
551 PF subges PF_CTL, PF_CTL, #0x10
553 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
554 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
/* emit the full-image 8888+8888 ADD function */
558 generate_composite_function \
559 pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
560 FLAG_DST_READWRITE, \
561 8, /* number of pixels, processed in a single block */ \
562 10, /* prefetch distance */ \
565 pixman_composite_add_8_8_process_pixblock_head, \
566 pixman_composite_add_8_8_process_pixblock_tail, \
567 pixman_composite_add_8888_8888_process_pixblock_tail_head
/* single-scanline variant (no prefetch-distance argument is used here) */
569 generate_composite_function_single_scanline \
570 pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
571 FLAG_DST_READWRITE, \
572 8, /* number of pixels, processed in a single block */ \
575 pixman_composite_add_8_8_process_pixblock_head, \
576 pixman_composite_add_8_8_process_pixblock_tail, \
577 pixman_composite_add_8888_8888_process_pixblock_tail_head
579 /******************************************************************************/
/*
 * OUT_REVERSE operator for 32bpp: dest = dest * (1 - src_alpha). The head
 * multiplies the inverted source alpha (d24) by the destination channels
 * (d6, d7 visible; the q8/q9 products for the other two channels are in
 * elided lines). .endm directives are elided from this listing.
 */
581 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
582 vmvn.8 d24, d3 /* get inverted alpha */
583 /* do alpha blending */
586 vmull.u8 q10, d24, d6
587 vmull.u8 q11, d24, d7
/* tail: rounding x/255 reduction of all four channel products, results
   narrowed into the store registers d28-d31 */
590 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
591 vrshr.u16 q14, q8, #8
592 vrshr.u16 q15, q9, #8
593 vrshr.u16 q12, q10, #8
594 vrshr.u16 q13, q11, #8
595 vraddhn.u16 d28, q14, q8
596 vraddhn.u16 d29, q15, q9
597 vraddhn.u16 d30, q12, q10
598 vraddhn.u16 d31, q13, q11
/* pipelined main-loop body with PF prefetch interleaved.
   NOTE(review): the vmulls at the bottom use d22 as the inverted alpha;
   the vmvn that sets d22 up appears in elided lines -- confirm. */
601 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
602 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
603 vrshr.u16 q14, q8, #8
604 PF add PF_X, PF_X, #8
606 vrshr.u16 q15, q9, #8
607 vrshr.u16 q12, q10, #8
608 vrshr.u16 q13, q11, #8
609 PF addne PF_X, PF_X, #8
610 PF subne PF_CTL, PF_CTL, #1
611 vraddhn.u16 d28, q14, q8
612 vraddhn.u16 d29, q15, q9
614 vraddhn.u16 d30, q12, q10
615 vraddhn.u16 d31, q13, q11
617 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
619 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
620 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
621 PF subge PF_X, PF_X, ORIG_W
623 PF subges PF_CTL, PF_CTL, #0x10
625 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
626 vmull.u8 q10, d22, d6
627 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
628 vmull.u8 q11, d22, d7
/* only the single-scanline variant is emitted for OUT_REVERSE */
631 generate_composite_function_single_scanline \
632 pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
633 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
634 8, /* number of pixels, processed in a single block */ \
637 pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
638 pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
639 pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
641 /******************************************************************************/
/*
 * OVER operator for 32bpp: dest = src + dest * (1 - src_alpha).
 * Implemented on top of OUT_REVERSE: the head is reused verbatim, and the
 * tail appends a saturating add of the source (q0, q1) to the OUT_REVERSE
 * result (q14, q15). .endm directives are elided from this listing.
 */
643 .macro pixman_composite_over_8888_8888_process_pixblock_head
644 pixman_composite_out_reverse_8888_8888_process_pixblock_head
647 .macro pixman_composite_over_8888_8888_process_pixblock_tail
648 pixman_composite_out_reverse_8888_8888_process_pixblock_tail
649 vqadd.u8 q14, q0, q14
650 vqadd.u8 q15, q1, q15
/* pipelined main-loop body: same structure as the OUT_REVERSE tail_head
   plus the two vqadds; d22 is the inverted alpha (set-up elided -- see the
   note on the OUT_REVERSE macro) */
653 .macro pixman_composite_over_8888_8888_process_pixblock_tail_head
654 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
655 vrshr.u16 q14, q8, #8
656 PF add PF_X, PF_X, #8
658 vrshr.u16 q15, q9, #8
659 vrshr.u16 q12, q10, #8
660 vrshr.u16 q13, q11, #8
661 PF addne PF_X, PF_X, #8
662 PF subne PF_CTL, PF_CTL, #1
663 vraddhn.u16 d28, q14, q8
664 vraddhn.u16 d29, q15, q9
666 vraddhn.u16 d30, q12, q10
667 vraddhn.u16 d31, q13, q11
668 vqadd.u8 q14, q0, q14
669 vqadd.u8 q15, q1, q15
671 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
673 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
674 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
675 PF subge PF_X, PF_X, ORIG_W
677 PF subges PF_CTL, PF_CTL, #0x10
679 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
680 vmull.u8 q10, d22, d6
681 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
682 vmull.u8 q11, d22, d7
/* emit the full-image OVER 8888/8888 function */
685 generate_composite_function \
686 pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
687 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
688 8, /* number of pixels, processed in a single block */ \
689 5, /* prefetch distance */ \
692 pixman_composite_over_8888_8888_process_pixblock_head, \
693 pixman_composite_over_8888_8888_process_pixblock_tail, \
694 pixman_composite_over_8888_8888_process_pixblock_tail_head
/* single-scanline OVER variant (no prefetch-distance argument) */
696 generate_composite_function_single_scanline \
697 pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
698 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
699 8, /* number of pixels, processed in a single block */ \
702 pixman_composite_over_8888_8888_process_pixblock_head, \
703 pixman_composite_over_8888_8888_process_pixblock_tail, \
704 pixman_composite_over_8888_8888_process_pixblock_tail_head
706 /******************************************************************************/
708 /* TODO: expand macros and do better instructions scheduling */
/*
 * OVER with a solid (constant) source onto 32bpp destination. Reuses the
 * over_8888_8888 head/tail; no source load is needed in the main loop
 * because the init macro below preloads the constant color.
 * NOTE(review): .endm directives and some init instructions (the
 * replication of the color into d0-d2, presumably) are elided from this
 * listing -- confirm against the full file.
 */
709 .macro pixman_composite_over_n_8888_process_pixblock_tail_head
710 pixman_composite_over_8888_8888_process_pixblock_tail
711 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
712 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
713 pixman_composite_over_8888_8888_process_pixblock_head
/* one-time setup: fetch the solid source color from the stack into d3 */
717 .macro pixman_composite_over_n_8888_init
718 add DUMMY, sp, #ARGS_STACK_OFFSET
719 vld1.32 {d3[0]}, [DUMMY]
/* emit pixman_composite_over_n_8888_asm_neon (solid src, 32bpp dest) */
726 generate_composite_function \
727 pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
728 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
729 8, /* number of pixels, processed in a single block */ \
730 5, /* prefetch distance */ \
731 pixman_composite_over_n_8888_init, \
733 pixman_composite_over_8888_8888_process_pixblock_head, \
734 pixman_composite_over_8888_8888_process_pixblock_tail, \
735 pixman_composite_over_n_8888_process_pixblock_tail_head
737 /******************************************************************************/
/*
 * OVER_REVERSE with a solid source: the roles are swapped, so the
 * destination is read into the 'source' registers d0-d3 (note the
 * dst_r_basereg=0 / src_basereg=4 remapping in the generate call below)
 * and the constant color sits in the 'destination' registers (init loads
 * into d7). Only the destination is prefetched -- the source is constant.
 * NOTE(review): interior lines and .endm are elided from this listing.
 */
739 .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
740 vrshr.u16 q14, q8, #8
741 PF add PF_X, PF_X, #8
743 vrshr.u16 q15, q9, #8
744 vrshr.u16 q12, q10, #8
745 vrshr.u16 q13, q11, #8
746 PF addne PF_X, PF_X, #8
747 PF subne PF_CTL, PF_CTL, #1
748 vraddhn.u16 d28, q14, q8
749 vraddhn.u16 d29, q15, q9
751 vraddhn.u16 d30, q12, q10
752 vraddhn.u16 d31, q13, q11
753 vqadd.u8 q14, q0, q14
754 vqadd.u8 q15, q1, q15
755 vld4.8 {d0, d1, d2, d3}, [DST_R, :128]!
757 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
758 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
759 PF subge PF_X, PF_X, ORIG_W
761 PF subges PF_CTL, PF_CTL, #0x10
763 vmull.u8 q10, d22, d6
764 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
765 vmull.u8 q11, d22, d7
/* one-time setup: fetch the solid color; its replication into the other
   working registers appears elided from this listing -- confirm */
768 .macro pixman_composite_over_reverse_n_8888_init
769 add DUMMY, sp, #ARGS_STACK_OFFSET
770 vld1.32 {d7[0]}, [DUMMY]
/* emit pixman_composite_over_reverse_n_8888_asm_neon; note the swapped
   base-register assignments matching the role reversal described above */
777 generate_composite_function \
778 pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
779 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
780 8, /* number of pixels, processed in a single block */ \
781 5, /* prefetch distance */ \
782 pixman_composite_over_reverse_n_8888_init, \
784 pixman_composite_over_8888_8888_process_pixblock_head, \
785 pixman_composite_over_8888_8888_process_pixblock_tail, \
786 pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
787 28, /* dst_w_basereg */ \
788 0, /* dst_r_basereg */ \
789 4, /* src_basereg */ \
790 24 /* mask_basereg */
792 /******************************************************************************/
/*
 * OVER of a8r8g8b8 source under an a8 mask onto r5g6b5 destination.
 * Two stages per block: IN (multiply the source channels d8-d11 by the
 * mask d24), then OVER of that result onto the 0565 destination. Uses
 * q0-q7, i.e. the callee-saved d8-d15 range -- hence the need_all_regs
 * init/cleanup in the generate call below.
 * NOTE(review): several instructions and all .endm directives are elided
 * from this listing (the fused original line numbers jump).
 */
794 .macro pixman_composite_over_8888_8_0565_process_pixblock_head
795 vmull.u8 q0, d24, d8 /* IN for SRC pixels (part1) */
797 vmull.u8 q6, d24, d10
798 vmull.u8 q7, d24, d11
799 vshrn.u16 d6, q2, #8 /* convert DST_R data to 32-bpp (part1) */
802 vrshr.u16 q8, q0, #8 /* IN for SRC pixels (part2) */
804 vrshr.u16 q10, q6, #8
805 vrshr.u16 q11, q7, #8
/* rounding x/255 narrow: masked source channels land in d0-d3 */
806 vraddhn.u16 d0, q0, q8
807 vraddhn.u16 d1, q1, q9
808 vraddhn.u16 d2, q6, q10
809 vraddhn.u16 d3, q7, q11
810 vsri.u8 d6, d6, #5 /* convert DST_R data to 32-bpp (part2) */
813 vshrn.u16 d30, q2, #2
/* OVER stage: multiply inverted masked alpha (d3 after elided vmvn --
   confirm) by the destination channels */
814 vmull.u8 q8, d3, d6 /* now do alpha blending */
816 vmull.u8 q10, d3, d30
/* tail: finish the x/255 reduction of the blend products, add the masked
   source and repack to r5g6b5 in {d28, d29} */
819 .macro pixman_composite_over_8888_8_0565_process_pixblock_tail
820 /* 3 cycle bubble (after vmull.u8) */
821 vrshr.u16 q13, q8, #8
822 vrshr.u16 q11, q9, #8
823 vrshr.u16 q15, q10, #8
824 vraddhn.u16 d16, q8, q13
825 vraddhn.u16 d27, q9, q11
826 vraddhn.u16 d26, q10, q15
827 vqadd.u8 d16, d2, d16
830 vshll.u8 q14, d16, #8 /* convert to 16bpp */
835 vsri.u16 q14, q9, #11
/* pipelined main-loop body: previous block's tail interleaved with the
   next block's loads and head (indentation levels follow the convention
   explained earlier in the file); many lines are elided from this listing */
838 .macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
839 vld1.16 {d4, d5}, [DST_R, :128]!
844 vmull.u8 q6, d24, d10
845 vrshr.u16 q13, q8, #8
846 vrshr.u16 q11, q9, #8
847 vrshr.u16 q15, q10, #8
848 vraddhn.u16 d16, q8, q13
849 vraddhn.u16 d27, q9, q11
850 vraddhn.u16 d26, q10, q15
851 vqadd.u8 d16, d2, d16
854 vshll.u8 q14, d16, #8
859 vmull.u8 q7, d24, d11
860 vsri.u16 q14, q9, #11
867 vrshr.u16 q10, q6, #8
868 vrshr.u16 q11, q7, #8
869 vraddhn.u16 d0, q0, q8
870 vraddhn.u16 d1, q1, q9
871 vraddhn.u16 d2, q6, q10
872 vraddhn.u16 d3, q7, q11
876 vshrn.u16 d30, q2, #2
877 vst1.16 {d28, d29}, [DST_W, :128]!
880 vmull.u8 q10, d3, d30
/*
 * Emit pixman_composite_over_8888_8_0565_asm_neon: 32bpp source, 8bpp
 * mask, 16bpp destination. default_init/cleanup_need_all_regs save and
 * restore the callee-saved d8-d15 registers that the pixblock macros use
 * (see the ABI note in the comment block below).
 */
883 generate_composite_function \
884 pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
885 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
886 8, /* number of pixels, processed in a single block */ \
887 5, /* prefetch distance */ \
888 default_init_need_all_regs, \
889 default_cleanup_need_all_regs, \
890 pixman_composite_over_8888_8_0565_process_pixblock_head, \
891 pixman_composite_over_8888_8_0565_process_pixblock_tail, \
892 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
893 28, /* dst_w_basereg */ \
894 4, /* dst_r_basereg */ \
895 8, /* src_basereg */ \
896 24 /* mask_basereg */
898 /******************************************************************************/
901 * This function needs a special initialization of solid mask.
902 * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
903 * offset, split into color components and replicated in d8-d11
904 * registers. Additionally, this function needs all the NEON registers,
905 * so it has to save d8-d15 registers which are callee saved according
906 * to ABI. These registers are restored from 'cleanup' macro. All the
907 * other NEON registers are caller saved, so can be clobbered freely
908 * without introducing any problems.
/*
 * Solid source + a8 mask onto r5g6b5: reuses the over_8888_8_0565
 * pixblock macros; only init/cleanup differ. Per the comment block above,
 * init fetches the solid color from the stack, splits it into components
 * in d8-d11, and saves the callee-saved d8-d15 registers; the replication
 * and register-save instructions are elided from this listing, as is the
 * cleanup body that restores d8-d15. .endm directives are also elided.
 */
910 .macro pixman_composite_over_n_8_0565_init
911 add DUMMY, sp, #ARGS_STACK_OFFSET
913 vld1.32 {d11[0]}, [DUMMY]
/* restore callee-saved NEON registers (body elided from this listing) */
920 .macro pixman_composite_over_n_8_0565_cleanup
/* emit pixman_composite_over_n_8_0565_asm_neon (solid src, 8bpp mask) */
924 generate_composite_function \
925 pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
926 FLAG_DST_READWRITE, \
927 8, /* number of pixels, processed in a single block */ \
928 5, /* prefetch distance */ \
929 pixman_composite_over_n_8_0565_init, \
930 pixman_composite_over_n_8_0565_cleanup, \
931 pixman_composite_over_8888_8_0565_process_pixblock_head, \
932 pixman_composite_over_8888_8_0565_process_pixblock_tail, \
933 pixman_composite_over_8888_8_0565_process_pixblock_tail_head
935 /******************************************************************************/
/*
 * 32bpp source + solid mask onto r5g6b5: again reuses the
 * over_8888_8_0565 pixblock macros. init fetches the solid mask value
 * (second stack argument, hence the +8 offset) into d24 -- the register
 * the head macro multiplies by. Replication/register-save instructions
 * and the cleanup body are elided from this listing, as are .endm's.
 */
937 .macro pixman_composite_over_8888_n_0565_init
938 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
940 vld1.32 {d24[0]}, [DUMMY]
/* restore callee-saved NEON registers (body elided from this listing) */
944 .macro pixman_composite_over_8888_n_0565_cleanup
/* emit pixman_composite_over_8888_n_0565_asm_neon (solid mask variant) */
948 generate_composite_function \
949 pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
950 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
951 8, /* number of pixels, processed in a single block */ \
952 5, /* prefetch distance */ \
953 pixman_composite_over_8888_n_0565_init, \
954 pixman_composite_over_8888_n_0565_cleanup, \
955 pixman_composite_over_8888_8_0565_process_pixblock_head, \
956 pixman_composite_over_8888_8_0565_process_pixblock_tail, \
957 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
958 28, /* dst_w_basereg */ \
959 4, /* dst_r_basereg */ \
960 8, /* src_basereg */ \
961 24 /* mask_basereg */
963 /******************************************************************************/
/*
 * Straight 16bpp -> 16bpp copy (SRC operator, no conversion): pixels pass
 * through d0-d3 untouched (all basereg arguments are 0), so head and tail
 * are empty and the main loop is just load + store. NOTE(review): the
 * matching vld1 of the tail_head, the .endm directives, and the final
 * mask_basereg argument of the generate call are elided from this listing.
 */
965 .macro pixman_composite_src_0565_0565_process_pixblock_head
968 .macro pixman_composite_src_0565_0565_process_pixblock_tail
971 .macro pixman_composite_src_0565_0565_process_pixblock_tail_head
972 vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
977 generate_composite_function \
978 pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
979 FLAG_DST_WRITEONLY, \
980 16, /* number of pixels, processed in a single block */ \
981 10, /* prefetch distance */ \
984 pixman_composite_src_0565_0565_process_pixblock_head, \
985 pixman_composite_src_0565_0565_process_pixblock_tail, \
986 pixman_composite_src_0565_0565_process_pixblock_tail_head, \
987 0, /* dst_w_basereg */ \
988 0, /* dst_r_basereg */ \
989 0, /* src_basereg */ \
992 /******************************************************************************/
/*
 * Solid fill of an 8bpp buffer: no per-block arithmetic (head/tail are
 * empty), the main loop just stores the pre-replicated color from d0-d3.
 * init loads the 32-bit solid value from the stack and replicates it
 * across d0 with shift-insert steps; an initial #8 replication step and
 * the copy into d1-d3 appear elided from this listing -- confirm. The
 * .endm directives and the cleanup body are also elided. Prefetch
 * distance 0 in the generate call: no source to prefetch for a fill.
 */
994 .macro pixman_composite_src_n_8_process_pixblock_head
997 .macro pixman_composite_src_n_8_process_pixblock_tail
1000 .macro pixman_composite_src_n_8_process_pixblock_tail_head
1001 vst1.8 {d0, d1, d2, d3}, [DST_W, :128]!
1004 .macro pixman_composite_src_n_8_init
1005 add DUMMY, sp, #ARGS_STACK_OFFSET
1006 vld1.32 {d0[0]}, [DUMMY]
/* replicate the color value across the whole 64-bit register */
1008 vsli.u64 d0, d0, #16
1009 vsli.u64 d0, d0, #32
1014 .macro pixman_composite_src_n_8_cleanup
/* emit pixman_composite_src_n_8_asm_neon (solid 8bpp fill) */
1017 generate_composite_function \
1018 pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
1019 FLAG_DST_WRITEONLY, \
1020 32, /* number of pixels, processed in a single block */ \
1021 0, /* prefetch distance */ \
1022 pixman_composite_src_n_8_init, \
1023 pixman_composite_src_n_8_cleanup, \
1024 pixman_composite_src_n_8_process_pixblock_head, \
1025 pixman_composite_src_n_8_process_pixblock_tail, \
1026 pixman_composite_src_n_8_process_pixblock_tail_head, \
1027 0, /* dst_w_basereg */ \
1028 0, /* dst_r_basereg */ \
1029 0, /* src_basereg */ \
1030 0 /* mask_basereg */
1032 /******************************************************************************/

/*
 * SRC fill: solid color -> r5g6b5 destination (16bpp constant fill).
 * Same scheme as src_n_8 above, but the vsli replication starts at 16
 * bits because each pixel is a halfword.
 */
1034 .macro pixman_composite_src_n_0565_process_pixblock_head
1037 .macro pixman_composite_src_n_0565_process_pixblock_tail
1040 .macro pixman_composite_src_n_0565_process_pixblock_tail_head
1041 vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!

1044 .macro pixman_composite_src_n_0565_init
1045 add DUMMY, sp, #ARGS_STACK_OFFSET
1046 vld1.32 {d0[0]}, [DUMMY]
/* replicate the 16-bit pixel value across all of d0 */
1047 vsli.u64 d0, d0, #16
1048 vsli.u64 d0, d0, #32

1053 .macro pixman_composite_src_n_0565_cleanup

1056 generate_composite_function \
1057 pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
1058 FLAG_DST_WRITEONLY, \
1059 16, /* number of pixels, processed in a single block */ \
1060 0, /* prefetch distance */ \
1061 pixman_composite_src_n_0565_init, \
1062 pixman_composite_src_n_0565_cleanup, \
1063 pixman_composite_src_n_0565_process_pixblock_head, \
1064 pixman_composite_src_n_0565_process_pixblock_tail, \
1065 pixman_composite_src_n_0565_process_pixblock_tail_head, \
1066 0, /* dst_w_basereg */ \
1067 0, /* dst_r_basereg */ \
1068 0, /* src_basereg */ \
1069 0 /* mask_basereg */
1071 /******************************************************************************/

/*
 * SRC fill: solid color -> a8r8g8b8 destination (32bpp constant fill).
 * One vsli is enough to duplicate the 32-bit value across d0.
 */
1073 .macro pixman_composite_src_n_8888_process_pixblock_head
1076 .macro pixman_composite_src_n_8888_process_pixblock_tail
1079 .macro pixman_composite_src_n_8888_process_pixblock_tail_head
1080 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!

1083 .macro pixman_composite_src_n_8888_init
1084 add DUMMY, sp, #ARGS_STACK_OFFSET
1085 vld1.32 {d0[0]}, [DUMMY]
1086 vsli.u64 d0, d0, #32

1091 .macro pixman_composite_src_n_8888_cleanup

1094 generate_composite_function \
1095 pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
1096 FLAG_DST_WRITEONLY, \
1097 8, /* number of pixels, processed in a single block */ \
1098 0, /* prefetch distance */ \
1099 pixman_composite_src_n_8888_init, \
1100 pixman_composite_src_n_8888_cleanup, \
1101 pixman_composite_src_n_8888_process_pixblock_head, \
1102 pixman_composite_src_n_8888_process_pixblock_tail, \
1103 pixman_composite_src_n_8888_process_pixblock_tail_head, \
1104 0, /* dst_w_basereg */ \
1105 0, /* dst_r_basereg */ \
1106 0, /* src_basereg */ \
1107 0 /* mask_basereg */
1109 /******************************************************************************/

/*
 * SRC copy: a8r8g8b8 -> a8r8g8b8 (32bpp blit).  Pure streaming store;
 * loads are generated by the framework since src_basereg is 0.
 */
1111 .macro pixman_composite_src_8888_8888_process_pixblock_head
1114 .macro pixman_composite_src_8888_8888_process_pixblock_tail
1117 .macro pixman_composite_src_8888_8888_process_pixblock_tail_head
1118 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!

1123 generate_composite_function \
1124 pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
1125 FLAG_DST_WRITEONLY, \
1126 8, /* number of pixels, processed in a single block */ \
1127 10, /* prefetch distance */ \
1130 pixman_composite_src_8888_8888_process_pixblock_head, \
1131 pixman_composite_src_8888_8888_process_pixblock_tail, \
1132 pixman_composite_src_8888_8888_process_pixblock_tail_head, \
1133 0, /* dst_w_basereg */ \
1134 0, /* dst_r_basereg */ \
1135 0, /* src_basereg */ \
1136 0 /* mask_basereg */
1138 /******************************************************************************/

/*
 * SRC convert: x8r8g8b8 -> a8r8g8b8.  Forces the (undefined) alpha byte
 * to 0xff; the init macro prepares a 0xff000000 mask in q2 by shifting
 * an all-ones pattern left by 24 (the OR with source is in elided lines).
 */
1140 .macro pixman_composite_src_x888_8888_process_pixblock_head
1145 .macro pixman_composite_src_x888_8888_process_pixblock_tail
1148 .macro pixman_composite_src_x888_8888_process_pixblock_tail_head
1149 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!

1156 .macro pixman_composite_src_x888_8888_init
/* build the per-pixel 0xff000000 alpha mask (q2 set in an elided line) */
1158 vshl.u32 q2, q2, #24

1161 generate_composite_function \
1162 pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
1163 FLAG_DST_WRITEONLY, \
1164 8, /* number of pixels, processed in a single block */ \
1165 10, /* prefetch distance */ \
1166 pixman_composite_src_x888_8888_init, \
1168 pixman_composite_src_x888_8888_process_pixblock_head, \
1169 pixman_composite_src_x888_8888_process_pixblock_tail, \
1170 pixman_composite_src_x888_8888_process_pixblock_tail_head, \
1171 0, /* dst_w_basereg */ \
1172 0, /* dst_r_basereg */ \
1173 0, /* src_basereg */ \
1174 0 /* mask_basereg */
1176 /******************************************************************************/

/*
 * OVER compositing: solid source, a8 mask, a8r8g8b8 destination.
 *
 * The vmull/vrshr/vraddhn triplets below implement pixman's standard
 * rounding division by 255:  t = a*b;  result = (t + ((t + 128) >> 8)
 * + 128) >> 8, which equals (a*b + 127..128)/255 for all 8-bit inputs.
 */
1178 .macro pixman_composite_over_n_8_8888_process_pixblock_head
1179 /* expecting deinterleaved source data in {d8, d9, d10, d11} */
1180 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
1181 /* and destination data in {d4, d5, d6, d7} */
1182 /* mask is in d24 (d25, d26, d27 are unused) */
/* in_8888_8: multiply each solid-source channel by the 8-bit mask */
1185 vmull.u8 q0, d24, d8
1186 vmull.u8 q1, d24, d9
1187 vmull.u8 q6, d24, d10
1188 vmull.u8 q7, d24, d11
/* rounding /255: (t + ((t+128)>>8) + 128) >> 8 via vrshr + vraddhn */
1189 vrshr.u16 q10, q0, #8
1190 vrshr.u16 q11, q1, #8
1191 vrshr.u16 q12, q6, #8
1192 vrshr.u16 q13, q7, #8
1193 vraddhn.u16 d0, q0, q10
1194 vraddhn.u16 d1, q1, q11
1195 vraddhn.u16 d2, q6, q12
1196 vraddhn.u16 d3, q7, q13
1197 vmvn.8 d24, d3 /* get inverted alpha */
1198 /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */
1199 /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
1200 /* now do alpha blending */
1201 vmull.u8 q8, d24, d4
1202 vmull.u8 q9, d24, d5
1203 vmull.u8 q10, d24, d6
1204 vmull.u8 q11, d24, d7

/* finish dst*(255-alpha)/255 and saturating-add the masked source */
1207 .macro pixman_composite_over_n_8_8888_process_pixblock_tail
1208 vrshr.u16 q14, q8, #8
1209 vrshr.u16 q15, q9, #8
1210 vrshr.u16 q12, q10, #8
1211 vrshr.u16 q13, q11, #8
1212 vraddhn.u16 d28, q14, q8
1213 vraddhn.u16 d29, q15, q9
1214 vraddhn.u16 d30, q12, q10
1215 vraddhn.u16 d31, q13, q11
/* result = src + dst*(1-alpha), saturating per channel */
1216 vqadd.u8 q14, q0, q14
1217 vqadd.u8 q15, q1, q15

1220 /* TODO: expand macros and do better instructions scheduling */
1221 .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
1222 pixman_composite_over_n_8_8888_process_pixblock_tail
1223 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1224 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1227 pixman_composite_over_n_8_8888_process_pixblock_head

1230 .macro pixman_composite_over_n_8_8888_init
1231 add DUMMY, sp, #ARGS_STACK_OFFSET
/* load the solid source; splatting to d8-d11 happens in elided lines */
1233 vld1.32 {d11[0]}, [DUMMY]

1240 .macro pixman_composite_over_n_8_8888_cleanup

1244 generate_composite_function \
1245 pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
1246 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1247 8, /* number of pixels, processed in a single block */ \
1248 5, /* prefetch distance */ \
1249 pixman_composite_over_n_8_8888_init, \
1250 pixman_composite_over_n_8_8888_cleanup, \
1251 pixman_composite_over_n_8_8888_process_pixblock_head, \
1252 pixman_composite_over_n_8_8888_process_pixblock_tail, \
1253 pixman_composite_over_n_8_8888_process_pixblock_tail_head
1255 /******************************************************************************/

/*
 * OVER compositing: solid source, a8 mask, a8 destination.
 * Only the alpha channel of the solid source (splatted into d8 by init)
 * matters, since dest is alpha-only.  Uses the same rounding /255 idiom
 * (vmull + vrshr + vraddhn) described for over_n_8_8888 above.
 */
1257 .macro pixman_composite_over_n_8_8_process_pixblock_head
/* src_alpha * mask for 32 mask bytes (d24-d27) */
1258 vmull.u8 q0, d24, d8
1259 vmull.u8 q1, d25, d8
1260 vmull.u8 q6, d26, d8
1261 vmull.u8 q7, d27, d8
1262 vrshr.u16 q10, q0, #8
1263 vrshr.u16 q11, q1, #8
1264 vrshr.u16 q12, q6, #8
1265 vrshr.u16 q13, q7, #8
1266 vraddhn.u16 d0, q0, q10
1267 vraddhn.u16 d1, q1, q11
1268 vraddhn.u16 d2, q6, q12
1269 vraddhn.u16 d3, q7, q13
/* dst * inverted-alpha (the vmvn producing d24-d27 is in elided lines) */
1272 vmull.u8 q8, d24, d4
1273 vmull.u8 q9, d25, d5
1274 vmull.u8 q10, d26, d6
1275 vmull.u8 q11, d27, d7

1278 .macro pixman_composite_over_n_8_8_process_pixblock_tail
1279 vrshr.u16 q14, q8, #8
1280 vrshr.u16 q15, q9, #8
1281 vrshr.u16 q12, q10, #8
1282 vrshr.u16 q13, q11, #8
1283 vraddhn.u16 d28, q14, q8
1284 vraddhn.u16 d29, q15, q9
1285 vraddhn.u16 d30, q12, q10
1286 vraddhn.u16 d31, q13, q11
/* result = src_in_mask + dst*(1-alpha), saturating */
1287 vqadd.u8 q14, q0, q14
1288 vqadd.u8 q15, q1, q15

1291 /* TODO: expand macros and do better instructions scheduling */
1292 .macro pixman_composite_over_n_8_8_process_pixblock_tail_head
1293 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1294 pixman_composite_over_n_8_8_process_pixblock_tail
1296 cache_preload 32, 32
1297 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1298 pixman_composite_over_n_8_8_process_pixblock_head

1301 .macro pixman_composite_over_n_8_8_init
1302 add DUMMY, sp, #ARGS_STACK_OFFSET
/* load solid source; alpha splat across d8 happens in elided lines */
1304 vld1.32 {d8[0]}, [DUMMY]

1308 .macro pixman_composite_over_n_8_8_cleanup

1312 generate_composite_function \
1313 pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
1314 FLAG_DST_READWRITE, \
1315 32, /* number of pixels, processed in a single block */ \
1316 5, /* prefetch distance */ \
1317 pixman_composite_over_n_8_8_init, \
1318 pixman_composite_over_n_8_8_cleanup, \
1319 pixman_composite_over_n_8_8_process_pixblock_head, \
1320 pixman_composite_over_n_8_8_process_pixblock_tail, \
1321 pixman_composite_over_n_8_8_process_pixblock_tail_head
1323 /******************************************************************************/

/*
 * OVER compositing with component alpha (ca): solid source, a8r8g8b8
 * per-channel mask, a8r8g8b8 destination.  Each mask channel scales the
 * matching source channel, and the mask is also multiplied by source
 * alpha to form the per-channel "inverse" factors for the dest.
 */
1325 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1327 * 'combine_mask_ca' replacement
1329 * input: solid src (n) in {d8, d9, d10, d11}
1330 * dest in {d4, d5, d6, d7 }
1331 * mask in {d24, d25, d26, d27}
1332 * output: updated src in {d0, d1, d2, d3 }
1333 * updated mask in {d24, d25, d26, d3 }
/* src_channel * mask_channel (per-channel masking of the solid color) */
1335 vmull.u8 q0, d24, d8
1336 vmull.u8 q1, d25, d9
1337 vmull.u8 q6, d26, d10
1338 vmull.u8 q7, d27, d11
/* src_alpha * mask_channel (per-channel factors for the dest blend) */
1339 vmull.u8 q9, d11, d25
1340 vmull.u8 q12, d11, d24
1341 vmull.u8 q13, d11, d26
/* rounding /255 idiom (vrshr + vraddhn) for all six products */
1342 vrshr.u16 q8, q0, #8
1343 vrshr.u16 q10, q1, #8
1344 vrshr.u16 q11, q6, #8
1345 vraddhn.u16 d0, q0, q8
1346 vraddhn.u16 d1, q1, q10
1347 vraddhn.u16 d2, q6, q11
1348 vrshr.u16 q11, q12, #8
1349 vrshr.u16 q8, q9, #8
1350 vrshr.u16 q6, q13, #8
1351 vrshr.u16 q10, q7, #8
1352 vraddhn.u16 d24, q12, q11
1353 vraddhn.u16 d25, q9, q8
1354 vraddhn.u16 d26, q13, q6
1355 vraddhn.u16 d3, q7, q10
1357 * 'combine_over_ca' replacement
1359 * output: updated dest in {d28, d29, d30, d31}
/* dst * (1 - alpha*mask); the vmvn inverting d24-d27 is in elided lines */
1363 vmull.u8 q8, d24, d4
1364 vmull.u8 q9, d25, d5
1367 vmull.u8 q10, d26, d6
1368 vmull.u8 q11, d27, d7

1371 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
1372 /* ... continue 'combine_over_ca' replacement */
1373 vrshr.u16 q14, q8, #8
1374 vrshr.u16 q15, q9, #8
1375 vrshr.u16 q6, q10, #8
1376 vrshr.u16 q7, q11, #8
1377 vraddhn.u16 d28, q14, q8
1378 vraddhn.u16 d29, q15, q9
1379 vraddhn.u16 d30, q6, q10
1380 vraddhn.u16 d31, q7, q11
/* result = masked_src + dst*(1 - alpha*mask), saturating */
1381 vqadd.u8 q14, q0, q14
1382 vqadd.u8 q15, q1, q15

/* tail_head: hand-interleaved copy of tail + head for better dual-issue */
1385 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1386 vrshr.u16 q14, q8, #8
1387 vrshr.u16 q15, q9, #8
1388 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1389 vrshr.u16 q6, q10, #8
1390 vrshr.u16 q7, q11, #8
1391 vraddhn.u16 d28, q14, q8
1392 vraddhn.u16 d29, q15, q9
1393 vraddhn.u16 d30, q6, q10
1394 vraddhn.u16 d31, q7, q11
1396 vqadd.u8 q14, q0, q14
1397 vqadd.u8 q15, q1, q15
1399 pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1400 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!

1403 .macro pixman_composite_over_n_8888_8888_ca_init
1404 add DUMMY, sp, #ARGS_STACK_OFFSET
/* load solid source; channel splat to d8-d11 happens in elided lines */
1406 vld1.32 {d11[0]}, [DUMMY]

1413 .macro pixman_composite_over_n_8888_8888_ca_cleanup

1417 generate_composite_function \
1418 pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
1419 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1420 8, /* number of pixels, processed in a single block */ \
1421 5, /* prefetch distance */ \
1422 pixman_composite_over_n_8888_8888_ca_init, \
1423 pixman_composite_over_n_8888_8888_ca_cleanup, \
1424 pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
1425 pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
1426 pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1428 /******************************************************************************/

/*
 * IN compositing: solid source alpha (splatted into d3 by init) applied
 * to an a8 destination: dst = dst * src_alpha / 255 (rounding /255 idiom).
 * The first two vmull of the head (q8/q9 from d4/d5) are in elided lines.
 */
1430 .macro pixman_composite_in_n_8_process_pixblock_head
1431 /* expecting source data in {d0, d1, d2, d3} */
1432 /* and destination data in {d4, d5, d6, d7} */
1435 vmull.u8 q10, d6, d3
1436 vmull.u8 q11, d7, d3

1439 .macro pixman_composite_in_n_8_process_pixblock_tail
1440 vrshr.u16 q14, q8, #8
1441 vrshr.u16 q15, q9, #8
1442 vrshr.u16 q12, q10, #8
1443 vrshr.u16 q13, q11, #8
1444 vraddhn.u16 d28, q8, q14
1445 vraddhn.u16 d29, q9, q15
1446 vraddhn.u16 d30, q10, q12
1447 vraddhn.u16 d31, q11, q13

1450 .macro pixman_composite_in_n_8_process_pixblock_tail_head
1451 pixman_composite_in_n_8_process_pixblock_tail
1452 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1453 cache_preload 32, 32
1454 pixman_composite_in_n_8_process_pixblock_head
1455 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!

1458 .macro pixman_composite_in_n_8_init
1459 add DUMMY, sp, #ARGS_STACK_OFFSET
/* load solid source; alpha extraction/splat to d3 is in elided lines */
1460 vld1.32 {d3[0]}, [DUMMY]

1464 .macro pixman_composite_in_n_8_cleanup

1467 generate_composite_function \
1468 pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
1469 FLAG_DST_READWRITE, \
1470 32, /* number of pixels, processed in a single block */ \
1471 5, /* prefetch distance */ \
1472 pixman_composite_in_n_8_init, \
1473 pixman_composite_in_n_8_cleanup, \
1474 pixman_composite_in_n_8_process_pixblock_head, \
1475 pixman_composite_in_n_8_process_pixblock_tail, \
1476 pixman_composite_in_n_8_process_pixblock_tail_head, \
1477 28, /* dst_w_basereg */ \
1478 4, /* dst_r_basereg */ \
1479 0, /* src_basereg */ \
1480 24 /* mask_basereg */
/*
 * ADD compositing: solid source, a8 mask, a8 destination:
 * dst = saturate(dst + mask * src_alpha / 255).
 * All of the work is in the head; the tail macro is empty.
 */
1482 .macro pixman_composite_add_n_8_8_process_pixblock_head
1483 /* expecting source data in {d8, d9, d10, d11} */
1484 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
1485 /* and destination data in {d4, d5, d6, d7} */
1486 /* mask is in d24, d25, d26, d27 */
/* mask * src_alpha with the rounding /255 idiom (vrshr + vraddhn) */
1487 vmull.u8 q0, d24, d11
1488 vmull.u8 q1, d25, d11
1489 vmull.u8 q6, d26, d11
1490 vmull.u8 q7, d27, d11
1491 vrshr.u16 q10, q0, #8
1492 vrshr.u16 q11, q1, #8
1493 vrshr.u16 q12, q6, #8
1494 vrshr.u16 q13, q7, #8
1495 vraddhn.u16 d0, q0, q10
1496 vraddhn.u16 d1, q1, q11
1497 vraddhn.u16 d2, q6, q12
1498 vraddhn.u16 d3, q7, q13
/* saturating add onto the 32 destination bytes */
1499 vqadd.u8 q14, q0, q2
1500 vqadd.u8 q15, q1, q3

1503 .macro pixman_composite_add_n_8_8_process_pixblock_tail

1506 /* TODO: expand macros and do better instructions scheduling */
1507 .macro pixman_composite_add_n_8_8_process_pixblock_tail_head
1508 pixman_composite_add_n_8_8_process_pixblock_tail
1509 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1510 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1512 cache_preload 32, 32
1513 pixman_composite_add_n_8_8_process_pixblock_head

1516 .macro pixman_composite_add_n_8_8_init
1517 add DUMMY, sp, #ARGS_STACK_OFFSET
/* load solid source; alpha splat across d11 happens in elided lines */
1519 vld1.32 {d11[0]}, [DUMMY]

1523 .macro pixman_composite_add_n_8_8_cleanup

1527 generate_composite_function \
1528 pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
1529 FLAG_DST_READWRITE, \
1530 32, /* number of pixels, processed in a single block */ \
1531 5, /* prefetch distance */ \
1532 pixman_composite_add_n_8_8_init, \
1533 pixman_composite_add_n_8_8_cleanup, \
1534 pixman_composite_add_n_8_8_process_pixblock_head, \
1535 pixman_composite_add_n_8_8_process_pixblock_tail, \
1536 pixman_composite_add_n_8_8_process_pixblock_tail_head
1538 /******************************************************************************/

/*
 * ADD compositing: a8 source, a8 mask, a8 destination:
 * dst = saturate(dst + src * mask / 255).  Same structure as add_n_8_8
 * but with a real (loaded) source instead of a splatted solid value.
 */
1540 .macro pixman_composite_add_8_8_8_process_pixblock_head
1541 /* expecting source data in {d0, d1, d2, d3} */
1542 /* destination data in {d4, d5, d6, d7} */
1543 /* mask in {d24, d25, d26, d27} */
1544 vmull.u8 q8, d24, d0
1545 vmull.u8 q9, d25, d1
1546 vmull.u8 q10, d26, d2
1547 vmull.u8 q11, d27, d3
/* rounding /255 (vrshr + vraddhn), then saturating add onto dest */
1548 vrshr.u16 q0, q8, #8
1549 vrshr.u16 q1, q9, #8
1550 vrshr.u16 q12, q10, #8
1551 vrshr.u16 q13, q11, #8
1552 vraddhn.u16 d0, q0, q8
1553 vraddhn.u16 d1, q1, q9
1554 vraddhn.u16 d2, q12, q10
1555 vraddhn.u16 d3, q13, q11
1556 vqadd.u8 q14, q0, q2
1557 vqadd.u8 q15, q1, q3

1560 .macro pixman_composite_add_8_8_8_process_pixblock_tail

1563 /* TODO: expand macros and do better instructions scheduling */
1564 .macro pixman_composite_add_8_8_8_process_pixblock_tail_head
1565 pixman_composite_add_8_8_8_process_pixblock_tail
1566 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1567 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1570 cache_preload 32, 32
1571 pixman_composite_add_8_8_8_process_pixblock_head

1574 .macro pixman_composite_add_8_8_8_init

1577 .macro pixman_composite_add_8_8_8_cleanup

1580 generate_composite_function \
1581 pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
1582 FLAG_DST_READWRITE, \
1583 32, /* number of pixels, processed in a single block */ \
1584 5, /* prefetch distance */ \
1585 pixman_composite_add_8_8_8_init, \
1586 pixman_composite_add_8_8_8_cleanup, \
1587 pixman_composite_add_8_8_8_process_pixblock_head, \
1588 pixman_composite_add_8_8_8_process_pixblock_tail, \
1589 pixman_composite_add_8_8_8_process_pixblock_tail_head
1591 /******************************************************************************/

/*
 * ADD compositing: a8r8g8b8 source, a8r8g8b8 mask, a8r8g8b8 destination:
 * dst = saturate(dst + src * mask_alpha / 255).  Note only the mask's
 * alpha channel (d27) multiplies all four source channels.
 *
 * Unlike the other ops, the /255 here is done with vrsra (t += (t+128)>>8
 * in place) followed by vrshrn (>> 8 with rounding and narrow), which
 * frees the q12/q13 temporaries used elsewhere.
 */
1593 .macro pixman_composite_add_8888_8888_8888_process_pixblock_head
1594 /* expecting source data in {d0, d1, d2, d3} */
1595 /* destination data in {d4, d5, d6, d7} */
1596 /* mask in {d24, d25, d26, d27} */
1597 vmull.u8 q8, d27, d0
1598 vmull.u8 q9, d27, d1
1599 vmull.u8 q10, d27, d2
1600 vmull.u8 q11, d27, d3
1601 /* 1 cycle bubble */
1602 vrsra.u16 q8, q8, #8
1603 vrsra.u16 q9, q9, #8
1604 vrsra.u16 q10, q10, #8
1605 vrsra.u16 q11, q11, #8

1608 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
1609 /* 2 cycle bubble */
1610 vrshrn.u16 d28, q8, #8
1611 vrshrn.u16 d29, q9, #8
1612 vrshrn.u16 d30, q10, #8
1613 vrshrn.u16 d31, q11, #8
1614 vqadd.u8 q14, q2, q14
1615 /* 1 cycle bubble */
1616 vqadd.u8 q15, q3, q15

/* tail_head: tail of block N interleaved with head of block N+1 to hide
   the bubbles noted above; loads/stores overlap the arithmetic */
1619 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
1621 vrshrn.u16 d28, q8, #8
1623 vrshrn.u16 d29, q9, #8
1624 vmull.u8 q8, d27, d0
1625 vrshrn.u16 d30, q10, #8
1626 vmull.u8 q9, d27, d1
1627 vrshrn.u16 d31, q11, #8
1628 vmull.u8 q10, d27, d2
1629 vqadd.u8 q14, q2, q14
1630 vmull.u8 q11, d27, d3
1631 vqadd.u8 q15, q3, q15
1632 vrsra.u16 q8, q8, #8
1633 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1634 vrsra.u16 q9, q9, #8
1635 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1636 vrsra.u16 q10, q10, #8
1640 vrsra.u16 q11, q11, #8

1643 generate_composite_function \
1644 pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
1645 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1646 8, /* number of pixels, processed in a single block */ \
1647 10, /* prefetch distance */ \
1650 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1651 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1652 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head

/* single-scanline variant (no prefetch/stride handling) sharing the
   same pixblock macros; used by the generic scanline compositing path */
1654 generate_composite_function_single_scanline \
1655 pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
1656 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1657 8, /* number of pixels, processed in a single block */ \
1660 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1661 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1662 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
1664 /******************************************************************************/

/*
 * ADD compositing: a8r8g8b8 source, a8 mask, a8r8g8b8 destination.
 * Reuses the add_8888_8888_8888 pixblock macros; mask_basereg 27 puts
 * the 8-bit mask into d27, the register those macros multiply by.
 */
1666 generate_composite_function \
1667 pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
1668 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1669 8, /* number of pixels, processed in a single block */ \
1670 5, /* prefetch distance */ \
1673 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1674 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1675 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
1676 28, /* dst_w_basereg */ \
1677 4, /* dst_r_basereg */ \
1678 0, /* src_basereg */ \
1679 27 /* mask_basereg */
1681 /******************************************************************************/

/*
 * ADD compositing: solid source, a8 mask, a8r8g8b8 destination.
 * Init loads the solid color into d3 (src_basereg 0 => source channels
 * live in d0-d3; splatting across d0-d2 presumably happens in elided
 * lines).  Shares the add_8888_8888_8888 pixblock macros.
 */
1683 .macro pixman_composite_add_n_8_8888_init
1684 add DUMMY, sp, #ARGS_STACK_OFFSET
1685 vld1.32 {d3[0]}, [DUMMY]

1692 .macro pixman_composite_add_n_8_8888_cleanup

1695 generate_composite_function \
1696 pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
1697 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1698 8, /* number of pixels, processed in a single block */ \
1699 5, /* prefetch distance */ \
1700 pixman_composite_add_n_8_8888_init, \
1701 pixman_composite_add_n_8_8888_cleanup, \
1702 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1703 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1704 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
1705 28, /* dst_w_basereg */ \
1706 4, /* dst_r_basereg */ \
1707 0, /* src_basereg */ \
1708 27 /* mask_basereg */
1710 /******************************************************************************/

/*
 * ADD compositing: a8r8g8b8 source, solid mask, a8r8g8b8 destination.
 * Init loads the solid mask (second stacked arg, hence the +8 offset)
 * into d27 — the register the shared add_8888_8888_8888 macros multiply
 * all source channels by.  Splat across d27 is presumably in elided lines.
 */
1712 .macro pixman_composite_add_8888_n_8888_init
1713 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
1714 vld1.32 {d27[0]}, [DUMMY]

1718 .macro pixman_composite_add_8888_n_8888_cleanup

1721 generate_composite_function \
1722 pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
1723 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1724 8, /* number of pixels, processed in a single block */ \
1725 5, /* prefetch distance */ \
1726 pixman_composite_add_8888_n_8888_init, \
1727 pixman_composite_add_8888_n_8888_cleanup, \
1728 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1729 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1730 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
1731 28, /* dst_w_basereg */ \
1732 4, /* dst_r_basereg */ \
1733 0, /* src_basereg */ \
1734 27 /* mask_basereg */
1736 /******************************************************************************/

/*
 * OUT_REVERSE compositing: dst = dst * (1 - src_alpha*mask/255).
 * Source is a8r8g8b8, mask is solid (in d15), dest is a8r8g8b8.
 * These pixblock macros are also the building blocks for the OVER
 * variants further below (which just add the masked source afterwards).
 */
1738 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
1739 /* expecting source data in {d0, d1, d2, d3} */
1740 /* destination data in {d4, d5, d6, d7} */
1741 /* solid mask is in d15 */
/* src * mask with the rounding /255 idiom (vrshr + vraddhn) */
1744 vmull.u8 q8, d15, d3
1745 vmull.u8 q6, d15, d2
1746 vmull.u8 q5, d15, d1
1747 vmull.u8 q4, d15, d0
1748 vrshr.u16 q13, q8, #8
1749 vrshr.u16 q12, q6, #8
1750 vrshr.u16 q11, q5, #8
1751 vrshr.u16 q10, q4, #8
1752 vraddhn.u16 d3, q8, q13
1753 vraddhn.u16 d2, q6, q12
1754 vraddhn.u16 d1, q5, q11
1755 vraddhn.u16 d0, q4, q10
1756 vmvn.8 d24, d3 /* get inverted alpha */
1757 /* now do alpha blending */
1758 vmull.u8 q8, d24, d4
1759 vmull.u8 q9, d24, d5
1760 vmull.u8 q10, d24, d6
1761 vmull.u8 q11, d24, d7

1764 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
1765 vrshr.u16 q14, q8, #8
1766 vrshr.u16 q15, q9, #8
1767 vrshr.u16 q12, q10, #8
1768 vrshr.u16 q13, q11, #8
1769 vraddhn.u16 d28, q14, q8
1770 vraddhn.u16 d29, q15, q9
1771 vraddhn.u16 d30, q12, q10
1772 vraddhn.u16 d31, q13, q11

1775 /* TODO: expand macros and do better instructions scheduling */
/* per-pixel-mask variant of tail_head (mask loads are in elided lines) */
1776 .macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
1777 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1778 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
1782 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
1783 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!

/* NOTE(review): the tail_head argument below has no trailing comma before
   the basereg args — gas also accepts whitespace as a macro-argument
   separator, but this is inconsistent with the other invocations. */
1786 generate_composite_function_single_scanline \
1787 pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
1788 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1789 8, /* number of pixels, processed in a single block */ \
1790 default_init_need_all_regs, \
1791 default_cleanup_need_all_regs, \
1792 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
1793 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
1794 pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \
1795 28, /* dst_w_basereg */ \
1796 4, /* dst_r_basereg */ \
1797 0, /* src_basereg */ \
1798 12 /* mask_basereg */
1800 /******************************************************************************/

/*
 * OVER compositing: a8r8g8b8 source, solid mask (d15), a8r8g8b8 dest.
 * Built on the out_reverse macros above: head/tail compute
 * dst*(1 - src_alpha*mask), then this tail saturating-adds the masked
 * source (left in q0/q1 by the shared head) to complete OVER.
 */
1802 .macro pixman_composite_over_8888_n_8888_process_pixblock_head
1803 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head

1806 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail
1807 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
1808 vqadd.u8 q14, q0, q14
1809 vqadd.u8 q15, q1, q15

1812 /* TODO: expand macros and do better instructions scheduling */
1813 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
1814 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1815 pixman_composite_over_8888_n_8888_process_pixblock_tail
1818 pixman_composite_over_8888_n_8888_process_pixblock_head
1819 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!

/* load the solid mask into d15 (DUMMY setup is in elided lines) */
1822 .macro pixman_composite_over_8888_n_8888_init
1825 vld1.32 {d15[0]}, [DUMMY]

1829 .macro pixman_composite_over_8888_n_8888_cleanup

1833 generate_composite_function \
1834 pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
1835 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1836 8, /* number of pixels, processed in a single block */ \
1837 5, /* prefetch distance */ \
1838 pixman_composite_over_8888_n_8888_init, \
1839 pixman_composite_over_8888_n_8888_cleanup, \
1840 pixman_composite_over_8888_n_8888_process_pixblock_head, \
1841 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1842 pixman_composite_over_8888_n_8888_process_pixblock_tail_head
1844 /******************************************************************************/

/*
 * OVER compositing: a8r8g8b8 source, a8r8g8b8 mask, a8r8g8b8 dest.
 * Shares the over_8888_n_8888 head/tail; this tail_head additionally
 * fetches per-pixel mask data (the mask load is in elided lines).
 */
1846 /* TODO: expand macros and do better instructions scheduling */
1847 .macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
1848 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1849 pixman_composite_over_8888_n_8888_process_pixblock_tail
1853 pixman_composite_over_8888_n_8888_process_pixblock_head
1854 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!

/* NOTE(review): as with the out_reverse scanline invocation, the
   tail_head argument lacks a trailing comma; gas splits on whitespace
   so this still parses, but it is stylistically inconsistent. */
1857 generate_composite_function \
1858 pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
1859 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1860 8, /* number of pixels, processed in a single block */ \
1861 5, /* prefetch distance */ \
1862 default_init_need_all_regs, \
1863 default_cleanup_need_all_regs, \
1864 pixman_composite_over_8888_n_8888_process_pixblock_head, \
1865 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1866 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
1867 28, /* dst_w_basereg */ \
1868 4, /* dst_r_basereg */ \
1869 0, /* src_basereg */ \
1870 12 /* mask_basereg */

/* single-scanline variant used by the generic compositing path */
1872 generate_composite_function_single_scanline \
1873 pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
1874 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1875 8, /* number of pixels, processed in a single block */ \
1876 default_init_need_all_regs, \
1877 default_cleanup_need_all_regs, \
1878 pixman_composite_over_8888_n_8888_process_pixblock_head, \
1879 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1880 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
1881 28, /* dst_w_basereg */ \
1882 4, /* dst_r_basereg */ \
1883 0, /* src_basereg */ \
1884 12 /* mask_basereg */
1886 /******************************************************************************/

/*
 * OVER compositing: a8r8g8b8 source, a8 mask, a8r8g8b8 destination.
 * Shares the over_8888_n_8888 head/tail; mask_basereg 15 places the
 * 8-bit mask in d15, matching the solid-mask register those macros use
 * (the per-pixel mask load is in elided lines of this tail_head).
 */
1888 /* TODO: expand macros and do better instructions scheduling */
1889 .macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
1890 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1891 pixman_composite_over_8888_n_8888_process_pixblock_tail
1895 pixman_composite_over_8888_n_8888_process_pixblock_head
1896 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!

1899 generate_composite_function \
1900 pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
1901 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1902 8, /* number of pixels, processed in a single block */ \
1903 5, /* prefetch distance */ \
1904 default_init_need_all_regs, \
1905 default_cleanup_need_all_regs, \
1906 pixman_composite_over_8888_n_8888_process_pixblock_head, \
1907 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1908 pixman_composite_over_8888_8_8888_process_pixblock_tail_head \
1909 28, /* dst_w_basereg */ \
1910 4, /* dst_r_basereg */ \
1911 0, /* src_basereg */ \
1912 15 /* mask_basereg */
1914 /******************************************************************************/

/*
 * SRC copy: 24bpp b8g8r8 -> b8g8r8 blit using 3-element interleaved
 * store (vst3).  No :128 alignment hint — 24bpp rows are not generally
 * 16-byte aligned.
 */
1916 .macro pixman_composite_src_0888_0888_process_pixblock_head
1919 .macro pixman_composite_src_0888_0888_process_pixblock_tail
1922 .macro pixman_composite_src_0888_0888_process_pixblock_tail_head
1923 vst3.8 {d0, d1, d2}, [DST_W]!

1928 generate_composite_function \
1929 pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
1930 FLAG_DST_WRITEONLY, \
1931 8, /* number of pixels, processed in a single block */ \
1932 10, /* prefetch distance */ \
1935 pixman_composite_src_0888_0888_process_pixblock_head, \
1936 pixman_composite_src_0888_0888_process_pixblock_tail, \
1937 pixman_composite_src_0888_0888_process_pixblock_tail_head, \
1938 0, /* dst_w_basereg */ \
1939 0, /* dst_r_basereg */ \
1940 0, /* src_basereg */ \
1941 0 /* mask_basereg */
1943 /******************************************************************************/

/*
 * SRC convert: 24bpp source -> a8r8g8b8 with channel order reversed
 * ("rev") and alpha forced opaque; the channel swap and alpha setup
 * are in elided lines (init presumably prepares the 0xff alpha plane).
 */
1945 .macro pixman_composite_src_0888_8888_rev_process_pixblock_head
1949 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
1952 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
1953 vst4.8 {d0, d1, d2, d3}, [DST_W]!

1959 .macro pixman_composite_src_0888_8888_rev_init

1963 generate_composite_function \
1964 pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
1965 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
1966 8, /* number of pixels, processed in a single block */ \
1967 10, /* prefetch distance */ \
1968 pixman_composite_src_0888_8888_rev_init, \
1970 pixman_composite_src_0888_8888_rev_process_pixblock_head, \
1971 pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
1972 pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
1973 0, /* dst_w_basereg */ \
1974 0, /* dst_r_basereg */ \
1975 0, /* src_basereg */ \
1976 0 /* mask_basereg */
1978 /******************************************************************************/

/*
 * SRC convert: 24bpp (reversed order) -> r5g6b5.  The x888->0565 pack:
 * widen one channel to the top byte of each halfword (vshll #8), then
 * shift-right-insert the other two channels' top bits at positions 5
 * and 11 (q8/q9 are prepared in elided head lines).
 */
1980 .macro pixman_composite_src_0888_0565_rev_process_pixblock_head
1985 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
1986 vshll.u8 q14, d0, #8
1987 vsri.u16 q14, q8, #5
1988 vsri.u16 q14, q9, #11

1991 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
1992 vshll.u8 q14, d0, #8
1994 vsri.u16 q14, q8, #5
1995 vsri.u16 q14, q9, #11
1997 vst1.16 {d28, d29}, [DST_W, :128]!

2001 generate_composite_function \
2002 pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
2003 FLAG_DST_WRITEONLY, \
2004 8, /* number of pixels, processed in a single block */ \
2005 10, /* prefetch distance */ \
2008 pixman_composite_src_0888_0565_rev_process_pixblock_head, \
2009 pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
2010 pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
2011 28, /* dst_w_basereg */ \
2012 0, /* dst_r_basereg */ \
2013 0, /* src_basereg */ \
2014 0 /* mask_basereg */
2016 /******************************************************************************/

/*
 * SRC convert from "pixbuf" format: multiplies color channels by alpha
 * (premultiplication) using the rounding /255 idiom.  Only one of the
 * three channel multiplies is visible here (q10 = alpha*d2); q8/q9 come
 * from elided lines.  The alpha channel itself passes through (d31,
 * handled in elided lines).  PF-prefixed lines are the framework's
 * software-prefetch sequence: advance PF_X, wrap at ORIG_W, and step
 * PF_SRC to the next scanline via the writeback ldrgeb.
 */
2018 .macro pixman_composite_src_pixbuf_8888_process_pixblock_head
2021 vmull.u8 q10, d3, d2

2024 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
2025 vrshr.u16 q11, q8, #8
2027 vrshr.u16 q12, q9, #8
2028 vrshr.u16 q13, q10, #8
2029 vraddhn.u16 d30, q11, q8
2030 vraddhn.u16 d29, q12, q9
2031 vraddhn.u16 d28, q13, q10

2034 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
2035 vrshr.u16 q11, q8, #8
2037 vrshr.u16 q12, q9, #8
2038 vrshr.u16 q13, q10, #8
2040 vraddhn.u16 d30, q11, q8
2041 PF add PF_X, PF_X, #8
2043 PF addne PF_X, PF_X, #8
2044 PF subne PF_CTL, PF_CTL, #1
2045 vraddhn.u16 d29, q12, q9
2046 vraddhn.u16 d28, q13, q10
2049 vmull.u8 q10, d3, d2
2050 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2052 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
2053 PF subge PF_X, PF_X, ORIG_W
2054 PF subges PF_CTL, PF_CTL, #0x10
2055 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
2058 generate_composite_function \
2059 pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
2060 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2061 8, /* number of pixels, processed in a single block */ \
2062 10, /* prefetch distance */ \
2065 pixman_composite_src_pixbuf_8888_process_pixblock_head, \
2066 pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
2067 pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
2068 28, /* dst_w_basereg */ \
2069 0, /* dst_r_basereg */ \
2070 0, /* src_basereg */ \
2071 0 /* mask_basereg */
2073 /******************************************************************************/
2075 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
/* Mirror of the pixbuf variant above: premultiply one channel (d2) by
 * alpha (d3); the other channel products (q8/q9) come from instructions
 * not visible in this excerpt. */
2078 vmull.u8 q10, d3, d2
2081 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
/* Divide by 255 with the rounding vrshr + vraddhn idiom; note the
 * d28/d29/d30 output order — the only difference from the pixbuf
 * variant (R and B channels swapped). */
2082 vrshr.u16 q11, q8, #8
2084 vrshr.u16 q12, q9, #8
2085 vrshr.u16 q13, q10, #8
2086 vraddhn.u16 d28, q11, q8
2087 vraddhn.u16 d29, q12, q9
2088 vraddhn.u16 d30, q13, q10
2091 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
/* Pipelined tail+head with prefetch bookkeeping, identical in
 * structure to the pixbuf tail_head above. */
2092 vrshr.u16 q11, q8, #8
2094 vrshr.u16 q12, q9, #8
2095 vrshr.u16 q13, q10, #8
2097 vraddhn.u16 d28, q11, q8
/* advance prefetch position by one 8-pixel block */
2098 PF add PF_X, PF_X, #8
2100 PF addne PF_X, PF_X, #8
2101 PF subne PF_CTL, PF_CTL, #1
2102 vraddhn.u16 d29, q12, q9
2103 vraddhn.u16 d30, q13, q10
2106 vmull.u8 q10, d3, d2
2107 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2109 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
/* wrap prefetch position and step to the next scanline when needed */
2110 PF subge PF_X, PF_X, ORIG_W
2111 PF subges PF_CTL, PF_CTL, #0x10
2112 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
/* src_rpixbuf_8888: 32bpp -> 32bpp premultiplied conversion with
 * swapped R/B channel order, destination write-only */
2115 generate_composite_function \
2116 pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
2117 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2118 8, /* number of pixels, processed in a single block */ \
2119 10, /* prefetch distance */ \
2122 pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
2123 pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
2124 pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
2125 28, /* dst_w_basereg */ \
2126 0, /* dst_r_basereg */ \
2127 0, /* src_basereg */ \
2128 0 /* mask_basereg */
2130 /******************************************************************************/
2132 .macro pixman_composite_over_0565_8_0565_process_pixblock_head
2133 /* mask is in d15 */
/* Unpack both source and destination r5g6b5 pixels to planar 8-bit
 * channels. */
2134 convert_0565_to_x888 q4, d2, d1, d0
2135 convert_0565_to_x888 q5, d6, d5, d4
2136 /* source pixel data is in {d0, d1, d2, XX} */
2137 /* destination pixel data is in {d4, d5, d6, XX} */
/* multiply each source channel by the 8-bit mask */
2139 vmull.u8 q6, d15, d2
2140 vmull.u8 q5, d15, d1
2141 vmull.u8 q4, d15, d0
/* NOTE(review): d7 presumably holds the inverted alpha of the masked
 * source, computed by instructions not visible in this excerpt — this
 * starts the dst * (1 - alpha) products; confirm against full source */
2144 vmull.u8 q13, d7, d6
/* divide the src*mask products by 255 (rounding vrshr+vraddhn idiom) */
2145 vrshr.u16 q12, q6, #8
2146 vrshr.u16 q11, q5, #8
2147 vrshr.u16 q10, q4, #8
2148 vraddhn.u16 d2, q6, q12
2149 vraddhn.u16 d1, q5, q11
2150 vraddhn.u16 d0, q4, q10
2153 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail
/* finish the dst * (1 - alpha) products (q8/q9/q13): divide by 255,
 * then saturating-add the masked source channels */
2154 vrshr.u16 q14, q8, #8
2155 vrshr.u16 q15, q9, #8
2156 vrshr.u16 q12, q13, #8
2157 vraddhn.u16 d28, q14, q8
2158 vraddhn.u16 d29, q15, q9
2159 vraddhn.u16 d30, q12, q13
2160 vqadd.u8 q0, q0, q14
2161 vqadd.u8 q1, q1, q15
2162 /* 32bpp result is in {d0, d1, d2, XX} */
/* repack to r5g6b5; the packed result ends up in q14 (stored below) */
2163 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2166 /* TODO: expand macros and do better instructions scheduling */
2167 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
/* naive pipelining: full tail, reload destination, full head, store */
2169 pixman_composite_over_0565_8_0565_process_pixblock_tail
2171 vld1.16 {d10, d11}, [DST_R, :128]!
2173 pixman_composite_over_0565_8_0565_process_pixblock_head
2174 vst1.16 {d28, d29}, [DST_W, :128]!
/* over_0565_8_0565: 16bpp OVER 16bpp through an 8-bit mask;
 * destination is read-modify-write */
2177 generate_composite_function \
2178 pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
2179 FLAG_DST_READWRITE, \
2180 8, /* number of pixels, processed in a single block */ \
2181 5, /* prefetch distance */ \
2182 default_init_need_all_regs, \
2183 default_cleanup_need_all_regs, \
2184 pixman_composite_over_0565_8_0565_process_pixblock_head, \
2185 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2186 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2187 28, /* dst_w_basereg */ \
2188 10, /* dst_r_basereg */ \
2189 8, /* src_basereg */ \
2190 15 /* mask_basereg */
2192 /******************************************************************************/
2194 .macro pixman_composite_over_0565_n_0565_init
/* Load the constant (solid) mask argument from the caller's stack
 * frame into d15; replication across all lanes of d15 happens in an
 * instruction not visible in this excerpt. */
2195 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
2197 vld1.32 {d15[0]}, [DUMMY]
2201 .macro pixman_composite_over_0565_n_0565_cleanup
/* Solid-mask variant of over_0565_8_0565: reuses its pixel-processing
 * macros unchanged, only init/cleanup differ (mask bpp is 0). */
2205 generate_composite_function \
2206 pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
2207 FLAG_DST_READWRITE, \
2208 8, /* number of pixels, processed in a single block */ \
2209 5, /* prefetch distance */ \
2210 pixman_composite_over_0565_n_0565_init, \
2211 pixman_composite_over_0565_n_0565_cleanup, \
2212 pixman_composite_over_0565_8_0565_process_pixblock_head, \
2213 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2214 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2215 28, /* dst_w_basereg */ \
2216 10, /* dst_r_basereg */ \
2217 8, /* src_basereg */ \
2218 15 /* mask_basereg */
2220 /******************************************************************************/
2222 .macro pixman_composite_add_0565_8_0565_process_pixblock_head
2223 /* mask is in d15 */
/* Unpack source and destination r5g6b5 to planar 8-bit channels. */
2224 convert_0565_to_x888 q4, d2, d1, d0
2225 convert_0565_to_x888 q5, d6, d5, d4
2226 /* source pixel data is in {d0, d1, d2, XX} */
2227 /* destination pixel data is in {d4, d5, d6, XX} */
/* src * mask per channel, then divide by 255 with the rounding
 * vrshr + vraddhn idiom */
2228 vmull.u8 q6, d15, d2
2229 vmull.u8 q5, d15, d1
2230 vmull.u8 q4, d15, d0
2231 vrshr.u16 q12, q6, #8
2232 vrshr.u16 q11, q5, #8
2233 vrshr.u16 q10, q4, #8
2234 vraddhn.u16 d2, q6, q12
2235 vraddhn.u16 d1, q5, q11
2236 vraddhn.u16 d0, q4, q10
2239 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail
/* NOTE(review): the ADD of the masked source with the destination is
 * presumably in instructions not visible in this excerpt; only the
 * final repack to r5g6b5 (result packed into q14, stored below) is
 * shown here. */
2242 /* 32bpp result is in {d0, d1, d2, XX} */
2243 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2246 /* TODO: expand macros and do better instructions scheduling */
2247 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
/* naive pipelining: full tail, reload destination, full head, store */
2249 pixman_composite_add_0565_8_0565_process_pixblock_tail
2251 vld1.16 {d10, d11}, [DST_R, :128]!
2253 pixman_composite_add_0565_8_0565_process_pixblock_head
2254 vst1.16 {d28, d29}, [DST_W, :128]!
/* add_0565_8_0565: 16bpp ADD 16bpp through an 8-bit mask */
2257 generate_composite_function \
2258 pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
2259 FLAG_DST_READWRITE, \
2260 8, /* number of pixels, processed in a single block */ \
2261 5, /* prefetch distance */ \
2262 default_init_need_all_regs, \
2263 default_cleanup_need_all_regs, \
2264 pixman_composite_add_0565_8_0565_process_pixblock_head, \
2265 pixman_composite_add_0565_8_0565_process_pixblock_tail, \
2266 pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
2267 28, /* dst_w_basereg */ \
2268 10, /* dst_r_basereg */ \
2269 8, /* src_basereg */ \
2270 15 /* mask_basereg */
2272 /******************************************************************************/
2274 .macro pixman_composite_out_reverse_8_0565_process_pixblock_head
2275 /* mask is in d15 */
/* OUT_REVERSE: dst = dst * (255 - mask); only the destination needs
 * unpacking — there is no source image. */
2276 convert_0565_to_x888 q5, d6, d5, d4
2277 /* destination pixel data is in {d4, d5, d6, xx} */
2278 vmvn.8 d24, d15 /* get inverted alpha */
2279 /* now do alpha blending */
2280 vmull.u8 q8, d24, d4
2281 vmull.u8 q9, d24, d5
2282 vmull.u8 q10, d24, d6
2285 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
/* divide the products by 255 (rounding vrshr + vraddhn idiom) and
 * repack to r5g6b5; packed result lands in q14 (stored below) */
2286 vrshr.u16 q14, q8, #8
2287 vrshr.u16 q15, q9, #8
2288 vrshr.u16 q12, q10, #8
2289 vraddhn.u16 d0, q14, q8
2290 vraddhn.u16 d1, q15, q9
2291 vraddhn.u16 d2, q12, q10
2292 /* 32bpp result is in {d0, d1, d2, XX} */
2293 convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2296 /* TODO: expand macros and do better instructions scheduling */
2297 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
/* naive pipelining: full tail, reload destination, full head, store */
2299 pixman_composite_out_reverse_8_0565_process_pixblock_tail
2300 vld1.16 {d10, d11}, [DST_R, :128]!
2302 pixman_composite_out_reverse_8_0565_process_pixblock_head
2303 vst1.16 {d28, d29}, [DST_W, :128]!
/* out_reverse_8_0565: 8-bit "source" acts as the mask (src bpp 8,
 * mask bpp 0), destination is read-modify-write 16bpp */
2306 generate_composite_function \
2307 pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
2308 FLAG_DST_READWRITE, \
2309 8, /* number of pixels, processed in a single block */ \
2310 5, /* prefetch distance */ \
2311 default_init_need_all_regs, \
2312 default_cleanup_need_all_regs, \
2313 pixman_composite_out_reverse_8_0565_process_pixblock_head, \
2314 pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
2315 pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
2316 28, /* dst_w_basereg */ \
2317 10, /* dst_r_basereg */ \
2318 15, /* src_basereg */ \
2319 0 /* mask_basereg */
2321 /******************************************************************************/
/* Nearest-neighbour scaled scanline functions.  Each instantiation
 * reuses the pixel-processing macros of the corresponding unscaled
 * composite operation defined earlier in this file; only the pixel
 * fetching differs (handled by the template). */
/* OVER: a8r8g8b8 source over a8r8g8b8 destination */
2323 generate_composite_function_nearest_scanline \
2324 pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
2325 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2326 8, /* number of pixels, processed in a single block */ \
2329 pixman_composite_over_8888_8888_process_pixblock_head, \
2330 pixman_composite_over_8888_8888_process_pixblock_tail, \
2331 pixman_composite_over_8888_8888_process_pixblock_tail_head
/* OVER: a8r8g8b8 source over r5g6b5 destination */
2333 generate_composite_function_nearest_scanline \
2334 pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
2335 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2336 8, /* number of pixels, processed in a single block */ \
2339 pixman_composite_over_8888_0565_process_pixblock_head, \
2340 pixman_composite_over_8888_0565_process_pixblock_tail, \
2341 pixman_composite_over_8888_0565_process_pixblock_tail_head, \
2342 28, /* dst_w_basereg */ \
2343 4, /* dst_r_basereg */ \
2344 0, /* src_basereg */ \
2345 24 /* mask_basereg */
/* SRC: a8r8g8b8 source copied to r5g6b5 destination (write-only) */
2347 generate_composite_function_nearest_scanline \
2348 pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
2349 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2350 8, /* number of pixels, processed in a single block */ \
2353 pixman_composite_src_8888_0565_process_pixblock_head, \
2354 pixman_composite_src_8888_0565_process_pixblock_tail, \
2355 pixman_composite_src_8888_0565_process_pixblock_tail_head
/* SRC: r5g6b5 source expanded to a8r8g8b8 destination (write-only) */
2357 generate_composite_function_nearest_scanline \
2358 pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
2359 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2360 8, /* number of pixels, processed in a single block */ \
2363 pixman_composite_src_0565_8888_process_pixblock_head, \
2364 pixman_composite_src_0565_8888_process_pixblock_tail, \
2365 pixman_composite_src_0565_8888_process_pixblock_tail_head
/* OVER with 8-bit mask: a8r8g8b8 over r5g6b5 */
2367 generate_composite_function_nearest_scanline \
2368 pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
2369 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2370 8, /* number of pixels, processed in a single block */ \
2371 default_init_need_all_regs, \
2372 default_cleanup_need_all_regs, \
2373 pixman_composite_over_8888_8_0565_process_pixblock_head, \
2374 pixman_composite_over_8888_8_0565_process_pixblock_tail, \
2375 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
2376 28, /* dst_w_basereg */ \
2377 4, /* dst_r_basereg */ \
2378 8, /* src_basereg */ \
2379 24 /* mask_basereg */
/* OVER with 8-bit mask: r5g6b5 over r5g6b5 */
2381 generate_composite_function_nearest_scanline \
2382 pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
2383 FLAG_DST_READWRITE, \
2384 8, /* number of pixels, processed in a single block */ \
2385 default_init_need_all_regs, \
2386 default_cleanup_need_all_regs, \
2387 pixman_composite_over_0565_8_0565_process_pixblock_head, \
2388 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2389 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2390 28, /* dst_w_basereg */ \
2391 10, /* dst_r_basereg */ \
2392 8, /* src_basereg */ \
2393 15 /* mask_basereg */
2395 /******************************************************************************/
2397 /* Supplementary macro for setting function attributes */
2398 .macro pixman_asm_function fname
/* Declare 'fname' as a function-typed ELF symbol; the accompanying
 * visibility/label directives are in lines not visible in this
 * excerpt. */
2403 .type fname, %function
2409 * Bilinear scaling support code which tries to provide pixel fetching, color
2410 * format conversion, and interpolation as separate macros which can be used
2411 * as the basic building blocks for constructing bilinear scanline functions.
2414 .macro bilinear_load_8888 reg1, reg2, tmp
/* X is a 16.16 fixed-point source coordinate: take its integer part,
 * scale by 4 bytes/pixel, and load two adjacent 32bpp pixels from the
 * TOP scanline into reg1 and the same two from BOTTOM into reg2. */
2415 mov TMP2, X, asr #16
2417 add TMP1, TOP, TMP2, asl #2
2418 add TMP2, BOTTOM, TMP2, asl #2
2419 vld1.32 {reg1}, [TMP1]
2420 vld1.32 {reg2}, [TMP2]
2423 .macro bilinear_load_0565 reg1, reg2, tmp
/* r5g6b5 flavour: 2 bytes/pixel, so each 32-bit lane load fetches two
 * adjacent pixels per scanline; all four packed pixels in reg2 are
 * then expanded to x888. */
2424 mov TMP2, X, asr #16
2426 add TMP1, TOP, TMP2, asl #1
2427 add TMP2, BOTTOM, TMP2, asl #1
2428 vld1.32 {reg2[0]}, [TMP1]
2429 vld1.32 {reg2[1]}, [TMP2]
2430 convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
2433 .macro bilinear_load_and_vertical_interpolate_two_8888 \
2434 acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
/* Vertical interpolation for two output pixels: acc = top * d28 +
 * bottom * d29, where d28/d29 hold the vertical weights (set up by the
 * caller of these macros). */
2436 bilinear_load_8888 reg1, reg2, tmp1
2437 vmull.u8 acc1, reg1, d28
2438 vmlal.u8 acc1, reg2, d29
2439 bilinear_load_8888 reg3, reg4, tmp2
2440 vmull.u8 acc2, reg3, d28
2441 vmlal.u8 acc2, reg4, d29
2444 .macro bilinear_load_and_vertical_interpolate_four_8888 \
2445 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
2446 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
/* Four output pixels = two independent two-pixel interpolations. */
2448 bilinear_load_and_vertical_interpolate_two_8888 \
2449 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
2450 bilinear_load_and_vertical_interpolate_two_8888 \
2451 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
2454 .macro bilinear_load_and_vertical_interpolate_two_0565 \
2455 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
/* r5g6b5 flavour: gather the packed top/bottom pixel pairs for two
 * source positions into the lanes of acc2, expand to planar x888,
 * then vertically interpolate with weights d28/d29.  (Some register
 * shuffling instructions are not visible in this excerpt.) */
2457 mov TMP2, X, asr #16
2459 mov TMP4, X, asr #16
2461 add TMP1, TOP, TMP2, asl #1
2462 add TMP2, BOTTOM, TMP2, asl #1
2463 add TMP3, TOP, TMP4, asl #1
2464 add TMP4, BOTTOM, TMP4, asl #1
2465 vld1.32 {acc2lo[0]}, [TMP1]
2466 vld1.32 {acc2hi[0]}, [TMP3]
2467 vld1.32 {acc2lo[1]}, [TMP2]
2468 vld1.32 {acc2hi[1]}, [TMP4]
2469 convert_0565_to_x888 acc2, reg3, reg2, reg1
2474 vmull.u8 acc1, reg1, d28
2475 vmlal.u8 acc1, reg2, d29
2476 vmull.u8 acc2, reg3, d28
2477 vmlal.u8 acc2, reg4, d29
2480 .macro bilinear_load_and_vertical_interpolate_four_0565 \
2481 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
2482 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
/* Expanded four-pixel r5g6b5 version: the x-half and y-half loads,
 * channel interleaving (vzip) and multiplies are interleaved by hand
 * to hide memory and pipeline latency. */
2484 mov TMP2, X, asr #16
2486 mov TMP4, X, asr #16
2488 add TMP1, TOP, TMP2, asl #1
2489 add TMP2, BOTTOM, TMP2, asl #1
2490 add TMP3, TOP, TMP4, asl #1
2491 add TMP4, BOTTOM, TMP4, asl #1
2492 vld1.32 {xacc2lo[0]}, [TMP1]
2493 vld1.32 {xacc2hi[0]}, [TMP3]
2494 vld1.32 {xacc2lo[1]}, [TMP2]
2495 vld1.32 {xacc2hi[1]}, [TMP4]
2496 convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
/* second pair of source positions (y half) */
2497 mov TMP2, X, asr #16
2499 mov TMP4, X, asr #16
2501 add TMP1, TOP, TMP2, asl #1
2502 add TMP2, BOTTOM, TMP2, asl #1
2503 add TMP3, TOP, TMP4, asl #1
2504 add TMP4, BOTTOM, TMP4, asl #1
/* y-half loads interleaved with x-half channel shuffles (vzip pairs
 * the top/bottom samples so each vmull/vmlal sees matching lanes) */
2505 vld1.32 {yacc2lo[0]}, [TMP1]
2506 vzip.u8 xreg1, xreg3
2507 vld1.32 {yacc2hi[0]}, [TMP3]
2508 vzip.u8 xreg2, xreg4
2509 vld1.32 {yacc2lo[1]}, [TMP2]
2510 vzip.u8 xreg3, xreg4
2511 vld1.32 {yacc2hi[1]}, [TMP4]
2512 vzip.u8 xreg1, xreg2
2513 convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
2514 vmull.u8 xacc1, xreg1, d28
2515 vzip.u8 yreg1, yreg3
2516 vmlal.u8 xacc1, xreg2, d29
2517 vzip.u8 yreg2, yreg4
2518 vmull.u8 xacc2, xreg3, d28
2519 vzip.u8 yreg3, yreg4
2520 vmlal.u8 xacc2, xreg4, d29
2521 vzip.u8 yreg1, yreg2
2522 vmull.u8 yacc1, yreg1, d28
2523 vmlal.u8 yacc1, yreg2, d29
2524 vmull.u8 yacc2, yreg3, d28
2525 vmlal.u8 yacc2, yreg4, d29
2528 .macro bilinear_store_8888 numpix, tmp1, tmp2
/* Store 4, 2 or 1 32bpp pixels from d0/d1 depending on numpix; the
 * numpix dispatch directives are not visible in this excerpt. */
2530 vst1.32 {d0, d1}, [OUT]!
2532 vst1.32 {d0}, [OUT]!
2534 vst1.32 {d0[0]}, [OUT, :32]!
2536 .error bilinear_store_8888 numpix is unsupported
2540 .macro bilinear_store_0565 numpix, tmp1, tmp2
/* Convert the result to packed r5g6b5 (into d2) and store 4, 2 or 1
 * pixels depending on numpix; dispatch directives not visible in this
 * excerpt. */
2545 convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
2547 vst1.16 {d2}, [OUT]!
2549 vst1.32 {d2[0]}, [OUT]!
2551 vst1.16 {d2[0]}, [OUT]!
2553 .error bilinear_store_0565 numpix is unsupported
2557 .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
/* One output pixel.  Vertical pass: q1 = top * d28 + bottom * d29.
 * Horizontal pass with fractional weight w = d30 (= d24 >> 8):
 *   q0 = left*256 - left*w + right*w = left*(256 - w) + right*w
 * where left/right are the two pixel halves d2/d3 of q1. */
2558 bilinear_load_&src_fmt d0, d1, d2
2559 vmull.u8 q1, d0, d28
2560 vmlal.u8 q1, d1, d29
2561 vshr.u16 d30, d24, #8
2562 /* 4 cycles bubble */
2563 vshll.u16 q0, d2, #8
2564 vmlsl.u16 q0, d2, d30
2565 vmlal.u16 q0, d3, d30
2566 /* 5 cycles bubble */
/* narrow the 32-bit accumulators back to 8-bit-per-channel range */
2567 vshrn.u32 d0, q0, #16
2568 /* 3 cycles bubble */
2570 /* 1 cycle bubble */
2571 bilinear_store_&dst_fmt 1, q2, q3
2574 .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
/* Two output pixels.  q12 accumulates the per-pixel horizontal weight
 * (16.16 fraction in the high byte); q15 extracts the weights for this
 * iteration and q12 is advanced by q13 for the next one. */
2575 bilinear_load_and_vertical_interpolate_two_&src_fmt \
2576 q1, q11, d0, d1, d20, d21, d22, d23
2577 vshr.u16 q15, q12, #8
2578 vadd.u16 q12, q12, q13
/* horizontal blend for pixel 0 (weight d30) and pixel 1 (weight d31) */
2579 vshll.u16 q0, d2, #8
2580 vmlsl.u16 q0, d2, d30
2581 vmlal.u16 q0, d3, d30
2582 vshll.u16 q10, d22, #8
2583 vmlsl.u16 q10, d22, d31
2584 vmlal.u16 q10, d23, d31
2585 vshrn.u32 d30, q0, #16
2586 vshrn.u32 d31, q10, #16
2588 bilinear_store_&dst_fmt 2, q2, q3
2591 .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
/* Four output pixels: vertical pass for all four, then two rounds of
 * horizontal blending (weights re-extracted from q12 after it is
 * advanced by q13 between the pairs). */
2592 bilinear_load_and_vertical_interpolate_four_&src_fmt \
2593 q1, q11, d0, d1, d20, d21, d22, d23 \
2594 q3, q9, d4, d5, d16, d17, d18, d19
2596 vshr.u16 q15, q12, #8
2597 vadd.u16 q12, q12, q13
/* horizontal blend, first pixel pair */
2598 vshll.u16 q0, d2, #8
2599 vmlsl.u16 q0, d2, d30
2600 vmlal.u16 q0, d3, d30
2601 vshll.u16 q10, d22, #8
2602 vmlsl.u16 q10, d22, d31
2603 vmlal.u16 q10, d23, d31
/* horizontal blend, second pixel pair (fresh weights from q12) */
2604 vshr.u16 q15, q12, #8
2605 vshll.u16 q2, d6, #8
2606 vmlsl.u16 q2, d6, d30
2607 vmlal.u16 q2, d7, d30
2608 vshll.u16 q8, d18, #8
2610 vmlsl.u16 q8, d18, d31
2611 vmlal.u16 q8, d19, d31
2612 vadd.u16 q12, q12, q13
2613 vshrn.u32 d0, q0, #16
2614 vshrn.u32 d1, q10, #16
2615 vshrn.u32 d4, q2, #16
2616 vshrn.u32 d5, q8, #16
2619 bilinear_store_&dst_fmt 4, q2, q3
2623 * Main template macro for generating NEON optimized bilinear scanline
2626 * TODO: use software pipelining and aligned writes to the destination buffer
2627 * in order to improve performance
2629 * Bilinear scanline scaler macro template uses the following arguments:
2630 * fname - name of the function to generate
2631 * src_fmt - source color format (8888 or 0565)
2632 * dst_fmt - destination color format (8888 or 0565)
2633 * bpp_shift - (1 << bpp_shift) is the size of source pixel in bytes
2634 * prefetch_distance - prefetch in the source image by that many
2638 .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
2639 bpp_shift, prefetch_distance
/* Emit a complete bilinear scanline scaler.  The register aliases
 * (TOP, BOTTOM, X, UX, WIDTH, WB, PF_OFFS, ...), weight setup and the
 * loop labels/branches are in lines not visible in this excerpt. */
2641 pixman_asm_function fname
2657 push {r4, r5, r6, r7, r8, r9}
2658 mov PF_OFFS, #prefetch_distance
/* remaining arguments were passed on the stack; ip points at them */
2659 ldmia ip, {WB, X, UX, WIDTH}
/* prefetch distance is in destination pixels: scale it by the 16.16
 * per-pixel source step UX ... */
2660 mul PF_OFFS, PF_OFFS, UX
2669 vadd.u16 d25, d25, d26
2670 vadd.u16 q13, q13, q13
2672 subs WIDTH, WIDTH, #4
/* ... then convert from a 16.16 pixel offset to a byte offset */
2674 mov PF_OFFS, PF_OFFS, asr #(16 - bpp_shift)
/* main loop processes 4 pixels per iteration; a 2-pixel and a 1-pixel
 * tail handle the remainder (branches not visible in this excerpt) */
2676 bilinear_interpolate_four_pixels src_fmt, dst_fmt
2677 subs WIDTH, WIDTH, #4
2682 bilinear_interpolate_two_pixels src_fmt, dst_fmt
2686 bilinear_interpolate_last_pixel src_fmt, dst_fmt
2688 pop {r4, r5, r6, r7, r8, r9}
/* Instantiate the bilinear scanline scalers for the supported
 * src/dst format pairs.  bpp_shift: 2 = 4-byte (32bpp) source pixels,
 * 1 = 2-byte (r5g6b5); prefetch distance is 28 destination pixels. */
2708 generate_bilinear_scanline_func \
2709 pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, 2, 28
2711 generate_bilinear_scanline_func \
2712 pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, 2, 28
2714 generate_bilinear_scanline_func \
2715 pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, 1, 28
2717 generate_bilinear_scanline_func \
2718 pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, 1, 28