/*
 * Copyright © 2011 SCore Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
 * Author: Taekyun Kim (tkq.kim@samsung.com)
 */
/*
 * This file contains scaled bilinear scanline functions implemented
 * using Siarhei's older bilinear macro template.
 *
 * << General scanline function procedures >> (see the sketch below)
 *  1. bilinear interpolate source pixels
 *  2. load mask pixels
 *  3. load destination pixels
 *  4. duplicate mask to fill whole register
 *  5. interleave source & destination pixels
 *  6. apply mask to source pixels
 *  7. combine source & destination pixels
 *  8. deinterleave final result
 *  9. store destination pixels
 *
 * All registers with a single number (e.g. src0, tmp0) are 64-bit registers.
 * Registers with double numbers (e.g. src01, dst01) are 128-bit registers.
 * All temp registers can be used freely outside the code block.
 * Assume that the symbols (register .req) OUT and MASK are defined by the
 * caller of these macro blocks.
 *
 * There can be lots of pipeline stalls inside a code block and between code
 * blocks. Further optimizations will be done with new macro templates using
 * the head/tail_head/tail scheme.
 */
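
/*
 * For reference, the per-block helpers defined further below
 * (bilinear_interpolate_last_pixel / _two_pixels / _four_pixels) follow the
 * numbered steps above by expanding roughly this macro call sequence
 * (a sketch, not literal code):
 *
 *   bilinear_load_&src_fmt or bilinear_load_and_vertical_interpolate_*  (1)
 *   bilinear_load_mask                                                  (2)
 *   bilinear_load_dst                                                   (3)
 *   bilinear_duplicate_mask                                             (4)
 *   bilinear_interleave_src_dst                                         (5)
 *   bilinear_apply_mask_to_src                                          (6)
 *   bilinear_combine                                                    (7)
 *   bilinear_deinterleave_dst                                           (8)
 *   bilinear_store_&dst_fmt                                             (9)
 */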
/* Prevent the stack from becoming executable for no reason... */
#if defined(__linux__) && defined (__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

#include "pixman-arm-neon-asm.h"

/*
 * Bilinear macros from pixman-arm-neon-asm.S
 */

/* Supplementary macro for setting function attributes */
.macro pixman_asm_function fname
    .type fname, %function

/*
 * Bilinear scaling support code which tries to provide pixel fetching, color
 * format conversion, and interpolation as separate macros which can be used
 * as the basic building blocks for constructing bilinear scanline functions.
 */
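
/*
 * As a scalar reference, this C sketch (types from <stdint.h>) shows what
 * the interpolation sequences below compute per 8-bit channel. It assumes
 * 8-bit weights with wt + wb == 256 for the vertical pass (the vmull/vmlal
 * by d28/d29) and wx in [0, 256) for the horizontal pass (the vshll #8 /
 * vmlsl / vmlal by d30/d31 followed by the final vshrn #16):
 *
 *   static inline uint8_t
 *   bilinear_channel (uint8_t tl, uint8_t tr, uint8_t bl, uint8_t br,
 *                     unsigned wt, unsigned wb, unsigned wx)
 *   {
 *       unsigned left  = tl * wt + bl * wb;            // vertical pass
 *       unsigned right = tr * wt + br * wb;            // vertical pass
 *       return (left * (256 - wx) + right * wx) >> 16; // horizontal pass
 *   }
 */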
.macro bilinear_load_8888 reg1, reg2, tmp
    add TMP1, TOP, TMP1, asl #2
    vld1.32 {reg1}, [TMP1], STRIDE
    vld1.32 {reg2}, [TMP1]

.macro bilinear_load_0565 reg1, reg2, tmp
    add TMP1, TOP, TMP1, asl #1
    vld1.32 {reg2[0]}, [TMP1], STRIDE
    vld1.32 {reg2[1]}, [TMP1]
    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp

.macro bilinear_load_and_vertical_interpolate_two_8888 \
    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
    bilinear_load_8888 reg1, reg2, tmp1
    vmull.u8 acc1, reg1, d28
    vmlal.u8 acc1, reg2, d29
    bilinear_load_8888 reg3, reg4, tmp2
    vmull.u8 acc2, reg3, d28
    vmlal.u8 acc2, reg4, d29

.macro bilinear_load_and_vertical_interpolate_four_8888 \
    xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
    yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
    bilinear_load_and_vertical_interpolate_two_8888 \
        xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
    bilinear_load_and_vertical_interpolate_two_8888 \
        yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

.macro bilinear_load_and_vertical_interpolate_two_0565 \
    acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
    add TMP1, TOP, TMP1, asl #1
    add TMP2, TOP, TMP2, asl #1
    vld1.32 {acc2lo[0]}, [TMP1], STRIDE
    vld1.32 {acc2hi[0]}, [TMP2], STRIDE
    vld1.32 {acc2lo[1]}, [TMP1]
    vld1.32 {acc2hi[1]}, [TMP2]
    convert_0565_to_x888 acc2, reg3, reg2, reg1
    vmull.u8 acc1, reg1, d28
    vmlal.u8 acc1, reg2, d29
    vmull.u8 acc2, reg3, d28
    vmlal.u8 acc2, reg4, d29

.macro bilinear_load_and_vertical_interpolate_four_0565 \
    xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
    yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
    add TMP1, TOP, TMP1, asl #1
    add TMP2, TOP, TMP2, asl #1
    vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
    vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
    vld1.32 {xacc2lo[1]}, [TMP1]
    vld1.32 {xacc2hi[1]}, [TMP2]
    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
    add TMP1, TOP, TMP1, asl #1
    add TMP2, TOP, TMP2, asl #1
    vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
    vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
    vld1.32 {yacc2lo[1]}, [TMP1]
    vld1.32 {yacc2hi[1]}, [TMP2]
    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
    vmull.u8 xacc1, xreg1, d28
    vmlal.u8 xacc1, xreg2, d29
    vmull.u8 xacc2, xreg3, d28
    vmlal.u8 xacc2, xreg4, d29
    vmull.u8 yacc1, yreg1, d28
    vmlal.u8 yacc1, yreg2, d29
    vmull.u8 yacc2, yreg3, d28
    vmlal.u8 yacc2, yreg4, d29

.macro bilinear_store_8888 numpix, tmp1, tmp2
    vst1.32 {d0, d1}, [OUT]!
    vst1.32 {d0[0]}, [OUT, :32]!
    .error bilinear_store_8888 numpix is unsupported

.macro bilinear_store_0565 numpix, tmp1, tmp2
    convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
    vst1.32 {d2[0]}, [OUT]!
    vst1.16 {d2[0]}, [OUT]!
    .error bilinear_store_0565 numpix is unsupported
/*
 * Macros for loading mask pixels into register 'mask'.
 * vdup must be done somewhere else.
 */
.macro bilinear_load_mask_x numpix, mask

.macro bilinear_load_mask_8 numpix, mask
    vld1.32 {mask[0]}, [MASK]!
    vld1.16 {mask[0]}, [MASK]!
    vld1.8 {mask[0]}, [MASK]!
    .error bilinear_load_mask_8 numpix is unsupported
    pld [MASK, #prefetch_offset]

.macro bilinear_load_mask mask_fmt, numpix, mask
    bilinear_load_mask_&mask_fmt numpix, mask
/*
 * Macros for loading destination pixels into registers 'dst0' and 'dst1'.
 * Interleave should be done somewhere else.
 */
.macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01

.macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01

.macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
    vld1.32 {dst0, dst1}, [OUT]
    vld1.32 {dst0}, [OUT]
    vld1.32 {dst0[0]}, [OUT]
    .error bilinear_load_dst_8888 numpix is unsupported
    pld [OUT, #(prefetch_offset * 4)]

.macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
    bilinear_load_dst_8888 numpix, dst0, dst1, dst01

.macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
    bilinear_load_dst_8888 numpix, dst0, dst1, dst01

.macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
    bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01
/*
 * Macros for duplicating a partially loaded mask to fill the entire register.
 * We will apply the mask to interleaved source pixels, that is
 * (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3)
 * (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3)
 * so we need to duplicate the loaded mask to fill the whole register.
 * For the two-pixel case the layout is
 * (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
 * (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
 * We can do some optimizations for this, including the last-pixel cases.
 */
.macro bilinear_duplicate_mask_x numpix, mask

.macro bilinear_duplicate_mask_8 numpix, mask
    vdup.32 mask, mask[0]
    vdup.16 mask, mask[0]
    .error bilinear_duplicate_mask_8 is unsupported

.macro bilinear_duplicate_mask mask_fmt, numpix, mask
    bilinear_duplicate_mask_&mask_fmt numpix, mask
/*
 * Macros for interleaving src and dst pixels into rrrr gggg bbbb aaaa form.
 * Interleaving should be done when a mask is used or the operator is 'over'.
 */
.macro bilinear_interleave src0, src1, dst0, dst1

.macro bilinear_interleave_src_dst_x_src \
    numpix, src0, src1, src01, dst0, dst1, dst01

.macro bilinear_interleave_src_dst_x_over \
    numpix, src0, src1, src01, dst0, dst1, dst01
    bilinear_interleave src0, src1, dst0, dst1

.macro bilinear_interleave_src_dst_x_add \
    numpix, src0, src1, src01, dst0, dst1, dst01

.macro bilinear_interleave_src_dst_8_src \
    numpix, src0, src1, src01, dst0, dst1, dst01
    bilinear_interleave src0, src1, dst0, dst1

.macro bilinear_interleave_src_dst_8_over \
    numpix, src0, src1, src01, dst0, dst1, dst01
    bilinear_interleave src0, src1, dst0, dst1

.macro bilinear_interleave_src_dst_8_add \
    numpix, src0, src1, src01, dst0, dst1, dst01
    bilinear_interleave src0, src1, dst0, dst1

.macro bilinear_interleave_src_dst \
    mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01
    bilinear_interleave_src_dst_&mask_fmt&_&op \
        numpix, src0, src1, src01, dst0, dst1, dst01
/*
 * Macros for applying masks to src pixels (see the combine_mask_u() function).
 * src and dst should be in interleaved form.
 * The mask register should be in the form (m0, m1, m2, m3).
 */
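
/*
 * A scalar sketch of the multiply used below (the vmull / vrshr #8 /
 * vraddhn triple), i.e. the usual rounded approximation of s * m / 255
 * used throughout pixman:
 *
 *   static inline uint8_t mul_un8 (uint8_t s, uint8_t m)
 *   {
 *       uint16_t t = (uint16_t) s * m;
 *       return (uint8_t) ((t + ((t + 128) >> 8) + 128) >> 8);
 *   }
 */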
.macro bilinear_apply_mask_to_src_x \
    numpix, src0, src1, src01, mask, \
    tmp01, tmp23, tmp45, tmp67

.macro bilinear_apply_mask_to_src_8 \
    numpix, src0, src1, src01, mask, \
    tmp01, tmp23, tmp45, tmp67
    vmull.u8 tmp01, src0, mask
    vmull.u8 tmp23, src1, mask
    vrshr.u16 tmp45, tmp01, #8
    vrshr.u16 tmp67, tmp23, #8
    vraddhn.u16 src0, tmp45, tmp01
    vraddhn.u16 src1, tmp67, tmp23

.macro bilinear_apply_mask_to_src \
    mask_fmt, numpix, src0, src1, src01, mask, \
    tmp01, tmp23, tmp45, tmp67
    bilinear_apply_mask_to_src_&mask_fmt \
        numpix, src0, src1, src01, mask, \
        tmp01, tmp23, tmp45, tmp67
/*
 * Macros for combining src and destination pixels.
 * Whether to interleave depends on the operator 'op'.
 */
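
/*
 * A scalar sketch of the 'over' combiner below, assuming premultiplied
 * a8r8g8b8 pixels and mul_un8() as sketched earlier: for each destination
 * channel d and (already mask-applied) source channel s with source alpha
 * sa, the result is a saturated s + d * (255 - sa) / 255 (the final
 * vqadd.u8 provides the saturation):
 *
 *   static inline uint8_t over_un8 (uint8_t s, uint8_t sa, uint8_t d)
 *   {
 *       unsigned v = s + mul_un8 (d, 255 - sa);
 *       return v > 255 ? 255 : (uint8_t) v;
 *   }
 */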
.macro bilinear_combine_src \
    numpix, src0, src1, src01, dst0, dst1, dst01, \
    tmp01, tmp23, tmp45, tmp67, tmp8

.macro bilinear_combine_over \
    numpix, src0, src1, src01, dst0, dst1, dst01, \
    tmp01, tmp23, tmp45, tmp67, tmp8
    vdup.32 tmp8, src1[1]
    vmull.u8 tmp01, dst0, tmp8
    vmull.u8 tmp23, dst1, tmp8
    vrshr.u16 tmp45, tmp01, #8
    vrshr.u16 tmp67, tmp23, #8
    vraddhn.u16 dst0, tmp45, tmp01
    vraddhn.u16 dst1, tmp67, tmp23
    vqadd.u8 src01, dst01, src01

.macro bilinear_combine_add \
    numpix, src0, src1, src01, dst0, dst1, dst01, \
    tmp01, tmp23, tmp45, tmp67, tmp8
    vqadd.u8 src01, dst01, src01

.macro bilinear_combine \
    op, numpix, src0, src1, src01, dst0, dst1, dst01, \
    tmp01, tmp23, tmp45, tmp67, tmp8
    bilinear_combine_&op \
        numpix, src0, src1, src01, dst0, dst1, dst01, \
        tmp01, tmp23, tmp45, tmp67, tmp8
/*
 * Macros for final deinterleaving of destination pixels, if needed.
 */
.macro bilinear_deinterleave numpix, dst0, dst1, dst01

.macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01

.macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
    bilinear_deinterleave numpix, dst0, dst1, dst01

.macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01

.macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
    bilinear_deinterleave numpix, dst0, dst1, dst01

.macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
    bilinear_deinterleave numpix, dst0, dst1, dst01

.macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
    bilinear_deinterleave numpix, dst0, dst1, dst01

.macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
    bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01
.macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
    bilinear_load_&src_fmt d0, d1, d2
    bilinear_load_mask mask_fmt, 1, d4
    bilinear_load_dst dst_fmt, op, 1, d18, d19, q9
    /* 5 cycles bubble */
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    /* 5 cycles bubble */
    bilinear_duplicate_mask mask_fmt, 1, d4
    vshrn.u32 d0, q0, #16
    /* 3 cycles bubble */
    bilinear_interleave_src_dst \
        mask_fmt, op, 1, d0, d1, q0, d18, d19, q9
    bilinear_apply_mask_to_src \
        mask_fmt, 1, d0, d1, q0, d4, \
        op, 1, d0, d1, q0, d18, d19, q9, \
    bilinear_deinterleave_dst mask_fmt, op, 1, d0, d1, q0
    bilinear_store_&dst_fmt 1, q2, q3

.macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
    bilinear_load_and_vertical_interpolate_two_&src_fmt \
        q1, q11, d0, d1, d20, d21, d22, d23
    bilinear_load_mask mask_fmt, 2, d4
    bilinear_load_dst dst_fmt, op, 2, d18, d19, q9
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    vshll.u16 q10, d22, #8
    vmlsl.u16 q10, d22, d31
    vmlal.u16 q10, d23, d31
    vshrn.u32 d0, q0, #16
    vshrn.u32 d1, q10, #16
    bilinear_duplicate_mask mask_fmt, 2, d4
    vshr.u16 q15, q12, #8
    vadd.u16 q12, q12, q13
    bilinear_interleave_src_dst \
        mask_fmt, op, 2, d0, d1, q0, d18, d19, q9
    bilinear_apply_mask_to_src \
        mask_fmt, 2, d0, d1, q0, d4, \
        op, 2, d0, d1, q0, d18, d19, q9, \
    bilinear_deinterleave_dst mask_fmt, op, 2, d0, d1, q0
    bilinear_store_&dst_fmt 2, q2, q3

.macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
    bilinear_load_and_vertical_interpolate_four_&src_fmt \
        q1, q11, d0, d1, d20, d21, d22, d23 \
        q3, q9, d4, d5, d16, d17, d18, d19
    sub TMP1, TMP1, STRIDE
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    vshll.u16 q10, d22, #8
    vmlsl.u16 q10, d22, d31
    vmlal.u16 q10, d23, d31
    vshr.u16 q15, q12, #8
    vmlsl.u16 q2, d6, d30
    vmlal.u16 q2, d7, d30
    vshll.u16 q8, d18, #8
    bilinear_load_mask mask_fmt, 4, d22
    bilinear_load_dst dst_fmt, op, 4, d2, d3, q1
    vmlsl.u16 q8, d18, d31
    vmlal.u16 q8, d19, d31
    vadd.u16 q12, q12, q13
    vshrn.u32 d0, q0, #16
    vshrn.u32 d1, q10, #16
    vshrn.u32 d4, q2, #16
    vshrn.u32 d5, q8, #16
    bilinear_duplicate_mask mask_fmt, 4, d22
    vshr.u16 q15, q12, #8
    vadd.u16 q12, q12, q13
    bilinear_interleave_src_dst \
        mask_fmt, op, 4, d0, d1, q0, d2, d3, q1
    bilinear_apply_mask_to_src \
        mask_fmt, 4, d0, d1, q0, d22, \
        op, 4, d0, d1, q0, d2, d3, q1, \
    bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0
    bilinear_store_&dst_fmt 4, q2, q3
.set BILINEAR_FLAG_USE_MASK,          1
.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2

/*
 * Main template macro for generating NEON optimized bilinear scanline functions.
 *
 * The bilinear scanline generator macro takes the following arguments:
 *  fname - name of the function to generate
 *  src_fmt - source color format (8888 or 0565)
 *  dst_fmt - destination color format (8888 or 0565)
 *  src/dst_bpp_shift - (1 << bpp_shift) is the size of a src/dst pixel in bytes
 *  process_last_pixel - code block that interpolates one pixel and does not
 *                       update the horizontal weight
 *  process_two_pixels - code block that interpolates two pixels and updates
 *                       the horizontal weight
 *  process_four_pixels - code block that interpolates four pixels and updates
 *                        the horizontal weight
 *  process_pixblock_head - head part of the middle loop
 *  process_pixblock_tail - tail part of the middle loop
 *  process_pixblock_tail_head - tail_head part of the middle loop
 *  pixblock_size - number of pixels processed in a single middle loop iteration
 *  prefetch_distance - prefetch in the source image by that many pixels ahead
 */
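
/*
 * Rough outline of the code emitted by this macro (a pseudocode sketch; the
 * real sequence below is branch- and register-scheduled):
 *
 *   load arguments, set STRIDE = BOTTOM - TOP, set up weight registers
 *   while OUT is not aligned for a full pixel block:
 *       process 1 / 2 / (4) pixels      ("ensure good destination alignment")
 *   if WIDTH >= pixblock_size:
 *       process_pixblock_head
 *       while WIDTH >= pixblock_size:
 *           process_pixblock_tail_head  (software-pipelined middle loop)
 *       process_pixblock_tail
 *   process the remaining 4 / 2 / 1 trailing pixels
 *   restore registers and return
 */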
.macro generate_bilinear_scanline_func \
    src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \
    bilinear_process_last_pixel, \
    bilinear_process_two_pixels, \
    bilinear_process_four_pixels, \
    bilinear_process_pixblock_head, \
    bilinear_process_pixblock_tail, \
    bilinear_process_pixblock_tail_head, \

pixman_asm_function fname
.if pixblock_size == 8
.elseif pixblock_size == 4
    .error unsupported pixblock size

.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
    push {r4, r5, r6, r7, r8, r9}
    mov PF_OFFS, #prefetch_distance
    ldmia ip, {WB, X, UX, WIDTH}
.set prefetch_offset, prefetch_distance
    push {r4, r5, r6, r7, r8, r9, r10, ip}
    mov PF_OFFS, #prefetch_distance
    ldmia ip, {WT, WB, X, UX, WIDTH}
    mul PF_OFFS, PF_OFFS, UX

.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
    sub STRIDE, BOTTOM, TOP
    vadd.u16 d25, d25, d26

/* ensure good destination alignment */
    tst OUT, #(1 << dst_bpp_shift)
    vshr.u16 q15, q12, #8
    vadd.u16 q12, q12, q13
    bilinear_process_last_pixel
    vadd.u16 q13, q13, q13
    vshr.u16 q15, q12, #8
    vadd.u16 q12, q12, q13

    tst OUT, #(1 << (dst_bpp_shift + 1))
    bilinear_process_two_pixels
.if pixblock_size == 8
    tst OUT, #(1 << (dst_bpp_shift + 2))
    bilinear_process_four_pixels

    subs WIDTH, WIDTH, #pixblock_size
    mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
    bilinear_process_pixblock_head
    subs WIDTH, WIDTH, #pixblock_size
    bilinear_process_pixblock_tail_head
    subs WIDTH, WIDTH, #pixblock_size
    bilinear_process_pixblock_tail

.if pixblock_size == 8
    bilinear_process_four_pixels
/* handle the remaining trailing pixels */
    bilinear_process_two_pixels
    bilinear_process_last_pixel

.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
    pop {r4, r5, r6, r7, r8, r9}
    pop {r4, r5, r6, r7, r8, r9, r10, ip}

.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
/* src_8888_8_8888 */
.macro bilinear_src_8888_8_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 8888, src

.macro bilinear_src_8888_8_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 8888, src

.macro bilinear_src_8888_8_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 8888, src

.macro bilinear_src_8888_8_8888_process_pixblock_head
    bilinear_src_8888_8_8888_process_four_pixels

.macro bilinear_src_8888_8_8888_process_pixblock_tail

.macro bilinear_src_8888_8_8888_process_pixblock_tail_head
    bilinear_src_8888_8_8888_process_pixblock_tail
    bilinear_src_8888_8_8888_process_pixblock_head

/* src_8888_8_0565 */
.macro bilinear_src_8888_8_0565_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 0565, src

.macro bilinear_src_8888_8_0565_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 0565, src

.macro bilinear_src_8888_8_0565_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 0565, src

.macro bilinear_src_8888_8_0565_process_pixblock_head
    bilinear_src_8888_8_0565_process_four_pixels

.macro bilinear_src_8888_8_0565_process_pixblock_tail

.macro bilinear_src_8888_8_0565_process_pixblock_tail_head
    bilinear_src_8888_8_0565_process_pixblock_tail
    bilinear_src_8888_8_0565_process_pixblock_head

/* src_0565_8_x888 */
.macro bilinear_src_0565_8_x888_process_last_pixel
    bilinear_interpolate_last_pixel 0565, 8, 8888, src

.macro bilinear_src_0565_8_x888_process_two_pixels
    bilinear_interpolate_two_pixels 0565, 8, 8888, src

.macro bilinear_src_0565_8_x888_process_four_pixels
    bilinear_interpolate_four_pixels 0565, 8, 8888, src

.macro bilinear_src_0565_8_x888_process_pixblock_head
    bilinear_src_0565_8_x888_process_four_pixels

.macro bilinear_src_0565_8_x888_process_pixblock_tail

.macro bilinear_src_0565_8_x888_process_pixblock_tail_head
    bilinear_src_0565_8_x888_process_pixblock_tail
    bilinear_src_0565_8_x888_process_pixblock_head

/* src_0565_8_0565 */
.macro bilinear_src_0565_8_0565_process_last_pixel
    bilinear_interpolate_last_pixel 0565, 8, 0565, src

.macro bilinear_src_0565_8_0565_process_two_pixels
    bilinear_interpolate_two_pixels 0565, 8, 0565, src

.macro bilinear_src_0565_8_0565_process_four_pixels
    bilinear_interpolate_four_pixels 0565, 8, 0565, src

.macro bilinear_src_0565_8_0565_process_pixblock_head
    bilinear_src_0565_8_0565_process_four_pixels

.macro bilinear_src_0565_8_0565_process_pixblock_tail

.macro bilinear_src_0565_8_0565_process_pixblock_tail_head
    bilinear_src_0565_8_0565_process_pixblock_tail
    bilinear_src_0565_8_0565_process_pixblock_head
.macro bilinear_over_8888_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, x, 8888, over

.macro bilinear_over_8888_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, x, 8888, over

.macro bilinear_over_8888_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, x, 8888, over

.macro bilinear_over_8888_8888_process_pixblock_head
    add TMP1, TOP, TMP1, asl #2
    add TMP2, TOP, TMP2, asl #2
    vld1.32 {d22}, [TMP1], STRIDE
    vld1.32 {d23}, [TMP1]
    add TMP3, TOP, TMP3, asl #2
    vmull.u8 q8, d22, d28
    vmlal.u8 q8, d23, d29
    vld1.32 {d22}, [TMP2], STRIDE
    vld1.32 {d23}, [TMP2]
    add TMP4, TOP, TMP4, asl #2
    vmull.u8 q9, d22, d28
    vmlal.u8 q9, d23, d29
    vld1.32 {d22}, [TMP3], STRIDE
    vld1.32 {d23}, [TMP3]
    vmull.u8 q10, d22, d28
    vmlal.u8 q10, d23, d29
    vshll.u16 q0, d16, #8
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    vld1.32 {d16}, [TMP4], STRIDE
    vld1.32 {d17}, [TMP4]
    vmull.u8 q11, d16, d28
    vmlal.u8 q11, d17, d29
    vshll.u16 q1, d18, #8
    vmlsl.u16 q1, d18, d31
    vmlal.u16 q1, d19, d31
    vshr.u16 q15, q12, #8
    vadd.u16 q12, q12, q13

.macro bilinear_over_8888_8888_process_pixblock_tail
    vshll.u16 q2, d20, #8
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #8
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vshrn.u32 d0, q0, #16
    vshrn.u32 d1, q1, #16
    vld1.32 {d2, d3}, [OUT, :128]
    pld [OUT, #(prefetch_offset * 4)]
    vshrn.u32 d4, q2, #16
    vshr.u16 q15, q12, #8
    vshrn.u32 d5, q3, #16
    vrshr.u16 q1, q11, #8
    vrshr.u16 q10, q2, #8
    vraddhn.u16 d2, q1, q11
    vraddhn.u16 d3, q10, q2
    vadd.u16 q12, q12, q13
    vst1.32 {d6, d7}, [OUT, :128]!

.macro bilinear_over_8888_8888_process_pixblock_tail_head
    vshll.u16 q2, d20, #8
    add TMP1, TOP, TMP1, asl #2
    vmlsl.u16 q2, d20, d30
    add TMP2, TOP, TMP2, asl #2
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #8
    vld1.32 {d20}, [TMP1], STRIDE
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vld1.32 {d21}, [TMP1]
    vmull.u8 q8, d20, d28
    vmlal.u8 q8, d21, d29
    vshrn.u32 d0, q0, #16
    vshrn.u32 d1, q1, #16
    vld1.32 {d2, d3}, [OUT, :128]
    vshrn.u32 d4, q2, #16
    vshr.u16 q15, q12, #8
    vld1.32 {d22}, [TMP2], STRIDE
    vshrn.u32 d5, q3, #16
    vld1.32 {d23}, [TMP2]
    vmull.u8 q9, d22, d28
    mov TMP3, X, asr #16
    add TMP3, TOP, TMP3, asl #2
    mov TMP4, X, asr #16
    add TMP4, TOP, TMP4, asl #2
    vmlal.u8 q9, d23, d29
    vld1.32 {d22}, [TMP3], STRIDE
    vld1.32 {d23}, [TMP3]
    vmull.u8 q10, d22, d28
    vmlal.u8 q10, d23, d29
    vmull.u8 q11, d2, d4
    vshll.u16 q0, d16, #8
    vmlsl.u16 q0, d16, d30
    vrshr.u16 q1, q11, #8
    vmlal.u16 q0, d17, d30
    vrshr.u16 q8, q2, #8
    vraddhn.u16 d2, q1, q11
    vraddhn.u16 d3, q8, q2
    vld1.32 {d16}, [TMP4], STRIDE
    vld1.32 {d17}, [TMP4]
    vmull.u8 q11, d16, d28
    vmlal.u8 q11, d17, d29
    vshll.u16 q1, d18, #8
    vmlsl.u16 q1, d18, d31
    vadd.u16 q12, q12, q13
    vmlal.u16 q1, d19, d31
    vshr.u16 q15, q12, #8
    vadd.u16 q12, q12, q13
    vst1.32 {d6, d7}, [OUT, :128]!
/* over_8888_8_8888 */
.macro bilinear_over_8888_8_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 8888, over

.macro bilinear_over_8888_8_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 8888, over

.macro bilinear_over_8888_8_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 8888, over

.macro bilinear_over_8888_8_8888_process_pixblock_head
    mov TMP1, X, asr #16
    add TMP1, TOP, TMP1, asl #2
    vld1.32 {d0}, [TMP1], STRIDE
    mov TMP2, X, asr #16
    add TMP2, TOP, TMP2, asl #2
    vld1.32 {d1}, [TMP1]
    mov TMP3, X, asr #16
    add TMP3, TOP, TMP3, asl #2
    vld1.32 {d2}, [TMP2], STRIDE
    mov TMP4, X, asr #16
    add TMP4, TOP, TMP4, asl #2
    vld1.32 {d3}, [TMP2]
    vmull.u8 q2, d0, d28
    vmull.u8 q3, d2, d28
    vmlal.u8 q2, d1, d29
    vmlal.u8 q3, d3, d29
    vshll.u16 q0, d4, #8
    vshll.u16 q1, d6, #8
    vmlsl.u16 q0, d4, d30
    vmlsl.u16 q1, d6, d31
    vmlal.u16 q0, d5, d30
    vmlal.u16 q1, d7, d31
    vshrn.u32 d0, q0, #16
    vshrn.u32 d1, q1, #16
    vld1.32 {d2}, [TMP3], STRIDE
    vld1.32 {d3}, [TMP3]
    vld1.32 {d4}, [TMP4], STRIDE
    vld1.32 {d5}, [TMP4]
    vmull.u8 q3, d2, d28
    vmlal.u8 q3, d3, d29
    vmull.u8 q1, d4, d28
    vmlal.u8 q1, d5, d29
    vshr.u16 q15, q12, #8
    vld1.32 {d22[0]}, [MASK]!
    pld [MASK, #prefetch_offset]
    vadd.u16 q12, q12, q13

.macro bilinear_over_8888_8_8888_process_pixblock_tail
    vshll.u16 q9, d6, #8
    vshll.u16 q10, d2, #8
    vmlsl.u16 q9, d6, d30
    vmlsl.u16 q10, d2, d31
    vmlal.u16 q9, d7, d30
    vmlal.u16 q10, d3, d31
    vshr.u16 q15, q12, #8
    vadd.u16 q12, q12, q13
    vshrn.u32 d18, q9, #16
    vshrn.u32 d19, q10, #16
    vld1.32 {d18, d19}, [OUT, :128]
    vmull.u8 q10, d16, d22
    vmull.u8 q11, d17, d22
    vrsra.u16 q10, q10, #8
    vrsra.u16 q11, q11, #8
    vrshrn.u16 d16, q10, #8
    vrshrn.u16 d17, q11, #8
    vmull.u8 q10, d18, d22
    vmull.u8 q11, d19, d22
    vrshr.u16 q9, q10, #8
    vrshr.u16 q0, q11, #8
    vraddhn.u16 d18, q9, q10
    vraddhn.u16 d19, q0, q11
    vst1.32 {d18, d19}, [OUT, :128]!

.macro bilinear_over_8888_8_8888_process_pixblock_tail_head
    vshll.u16 q9, d6, #8
    mov TMP1, X, asr #16
    add TMP1, TOP, TMP1, asl #2
    vshll.u16 q10, d2, #8
    vld1.32 {d0}, [TMP1], STRIDE
    mov TMP2, X, asr #16
    add TMP2, TOP, TMP2, asl #2
    vmlsl.u16 q9, d6, d30
    vmlsl.u16 q10, d2, d31
    vld1.32 {d1}, [TMP1]
    mov TMP3, X, asr #16
    add TMP3, TOP, TMP3, asl #2
    vmlal.u16 q9, d7, d30
    vmlal.u16 q10, d3, d31
    vld1.32 {d2}, [TMP2], STRIDE
    mov TMP4, X, asr #16
    add TMP4, TOP, TMP4, asl #2
    vshr.u16 q15, q12, #8
    vadd.u16 q12, q12, q13
    vld1.32 {d3}, [TMP2]
    vshrn.u32 d18, q9, #16
    vshrn.u32 d19, q10, #16
    vmull.u8 q2, d0, d28
    vmull.u8 q3, d2, d28
    vld1.32 {d18, d19}, [OUT, :128]
    pld [OUT, #(prefetch_offset * 4)]
    vmlal.u8 q2, d1, d29
    vmlal.u8 q3, d3, d29
    vshll.u16 q0, d4, #8
    vshll.u16 q1, d6, #8
    vmlsl.u16 q0, d4, d30
    vmlsl.u16 q1, d6, d31
    vmull.u8 q10, d16, d22
    vmull.u8 q11, d17, d22
    vmlal.u16 q0, d5, d30
    vmlal.u16 q1, d7, d31
    vrsra.u16 q10, q10, #8
    vrsra.u16 q11, q11, #8
    vshrn.u32 d0, q0, #16
    vshrn.u32 d1, q1, #16
    vrshrn.u16 d16, q10, #8
    vrshrn.u16 d17, q11, #8
    vld1.32 {d2}, [TMP3], STRIDE
    vld1.32 {d3}, [TMP3]
    vld1.32 {d4}, [TMP4], STRIDE
    vmull.u8 q10, d18, d22
    vmull.u8 q11, d19, d22
    vld1.32 {d5}, [TMP4]
    vmull.u8 q3, d2, d28
    vrshr.u16 q9, q10, #8
    vrshr.u16 q15, q11, #8
    vmlal.u8 q3, d3, d29
    vmull.u8 q1, d4, d28
    vraddhn.u16 d18, q9, q10
    vraddhn.u16 d19, q15, q11
    vmlal.u8 q1, d5, d29
    vshr.u16 q15, q12, #8
    vld1.32 {d22[0]}, [MASK]!
    vadd.u16 q12, q12, q13
    vst1.32 {d18, d19}, [OUT, :128]!
.macro bilinear_add_8888_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, x, 8888, add

.macro bilinear_add_8888_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, x, 8888, add

.macro bilinear_add_8888_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, x, 8888, add

.macro bilinear_add_8888_8888_process_pixblock_head
    bilinear_add_8888_8888_process_four_pixels

.macro bilinear_add_8888_8888_process_pixblock_tail

.macro bilinear_add_8888_8888_process_pixblock_tail_head
    bilinear_add_8888_8888_process_pixblock_tail
    bilinear_add_8888_8888_process_pixblock_head

/* add_8888_8_8888 */
.macro bilinear_add_8888_8_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 8888, add

.macro bilinear_add_8888_8_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 8888, add

.macro bilinear_add_8888_8_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 8888, add

.macro bilinear_add_8888_8_8888_process_pixblock_head
    bilinear_add_8888_8_8888_process_four_pixels

.macro bilinear_add_8888_8_8888_process_pixblock_tail

.macro bilinear_add_8888_8_8888_process_pixblock_tail_head
    bilinear_add_8888_8_8888_process_pixblock_tail
    bilinear_add_8888_8_8888_process_pixblock_head
/* Bilinear scanline functions */
generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \
    bilinear_src_8888_8_8888_process_last_pixel, \
    bilinear_src_8888_8_8888_process_two_pixels, \
    bilinear_src_8888_8_8888_process_four_pixels, \
    bilinear_src_8888_8_8888_process_pixblock_head, \
    bilinear_src_8888_8_8888_process_pixblock_tail, \
    bilinear_src_8888_8_8888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \
    bilinear_src_8888_8_0565_process_last_pixel, \
    bilinear_src_8888_8_0565_process_two_pixels, \
    bilinear_src_8888_8_0565_process_four_pixels, \
    bilinear_src_8888_8_0565_process_pixblock_head, \
    bilinear_src_8888_8_0565_process_pixblock_tail, \
    bilinear_src_8888_8_0565_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \
    bilinear_src_0565_8_x888_process_last_pixel, \
    bilinear_src_0565_8_x888_process_two_pixels, \
    bilinear_src_0565_8_x888_process_four_pixels, \
    bilinear_src_0565_8_x888_process_pixblock_head, \
    bilinear_src_0565_8_x888_process_pixblock_tail, \
    bilinear_src_0565_8_x888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \
    bilinear_src_0565_8_0565_process_last_pixel, \
    bilinear_src_0565_8_0565_process_two_pixels, \
    bilinear_src_0565_8_0565_process_four_pixels, \
    bilinear_src_0565_8_0565_process_pixblock_head, \
    bilinear_src_0565_8_0565_process_pixblock_tail, \
    bilinear_src_0565_8_0565_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \
    bilinear_over_8888_8888_process_last_pixel, \
    bilinear_over_8888_8888_process_two_pixels, \
    bilinear_over_8888_8888_process_four_pixels, \
    bilinear_over_8888_8888_process_pixblock_head, \
    bilinear_over_8888_8888_process_pixblock_tail, \
    bilinear_over_8888_8888_process_pixblock_tail_head, \

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \
    bilinear_over_8888_8_8888_process_last_pixel, \
    bilinear_over_8888_8_8888_process_two_pixels, \
    bilinear_over_8888_8_8888_process_four_pixels, \
    bilinear_over_8888_8_8888_process_pixblock_head, \
    bilinear_over_8888_8_8888_process_pixblock_tail, \
    bilinear_over_8888_8_8888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \
    bilinear_add_8888_8888_process_last_pixel, \
    bilinear_add_8888_8888_process_two_pixels, \
    bilinear_add_8888_8888_process_four_pixels, \
    bilinear_add_8888_8888_process_pixblock_head, \
    bilinear_add_8888_8888_process_pixblock_tail, \
    bilinear_add_8888_8888_process_pixblock_tail_head, \

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
    bilinear_add_8888_8_8888_process_last_pixel, \
    bilinear_add_8888_8_8888_process_two_pixels, \
    bilinear_add_8888_8_8888_process_four_pixels, \
    bilinear_add_8888_8_8888_process_pixblock_head, \
    bilinear_add_8888_8_8888_process_pixblock_tail, \
    bilinear_add_8888_8_8888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK