/*
 * Copyright © 2009 ARM Ltd, Movial Creative Technologies Oy
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of ARM Ltd not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  ARM Ltd makes no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Ian Rickards (ian.rickards@arm.com)
 * Author:  Jonathan Morton (jonathan.morton@movial.com)
 * Author:  Markku Vire (markku.vire@movial.com)
 *
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <arm_neon.h>

#include "pixman-private.h"

/* Deal with an intrinsic that is defined differently in GCC */
#if !defined(__ARMCC_VERSION) && !defined(__pld)
#define __pld(_x) __builtin_prefetch (_x)
#endif
static force_inline uint8x8x4_t
unpack0565 (uint16x8_t rgb)
{
    uint16x8_t gb, b;
    uint8x8x4_t res;

    res.val[3] = vdup_n_u8 (0);
    gb = vshrq_n_u16 (rgb, 5);
    b = vshrq_n_u16 (rgb, 5 + 6);

    res.val[0] = vmovn_u16 (rgb);  /* get low 5 bits */
    res.val[1] = vmovn_u16 (gb);   /* get mid 6 bits */
    res.val[2] = vmovn_u16 (b);    /* get top 5 bits */

    res.val[0] = vshl_n_u8 (res.val[0], 3);  /* shift to top */
    res.val[1] = vshl_n_u8 (res.val[1], 2);  /* shift to top */
    res.val[2] = vshl_n_u8 (res.val[2], 3);  /* shift to top */

    /* duplicate the high bits into the vacated low bits */
    res.val[0] = vsri_n_u8 (res.val[0], res.val[0], 5);
    res.val[1] = vsri_n_u8 (res.val[1], res.val[1], 6);
    res.val[2] = vsri_n_u8 (res.val[2], res.val[2], 5);

    return res;
}
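/* Worked example for one lane: the r5g6b5 pixel 0x07E0 (pure green) has
 * g = 0x3F; shifted to the top of the byte it becomes 0xFC, and
 * re-inserting the same bits six places down with vsri gives 0xFF, so a
 * maximal 5- or 6-bit channel expands to a maximal 8-bit channel.
 */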
static force_inline uint16x8_t
pack0565 (uint8x8x4_t s)
{
    uint16x8_t rgb, val_g, val_r;

    rgb = vshll_n_u8 (s.val[2], 8);
    val_g = vshll_n_u8 (s.val[1], 8);
    val_r = vshll_n_u8 (s.val[0], 8);
    rgb = vsriq_n_u16 (rgb, val_g, 5);
    rgb = vsriq_n_u16 (rgb, val_r, 5 + 6);

    return rgb;
}
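/* Channel layout note: val[0] holds the channel packed in the low 5 bits
 * of the 16-bit pixel (blue in r5g6b5), val[1] the middle 6 bits (green)
 * and val[2] the top 5 bits (red). pack0565 rebuilds the pixel by
 * inserting green five bits below red and blue a further six bits down;
 * the duplicated low bits simply fall off the end.
 */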
static force_inline uint8x8_t
neon2mul (uint8x8_t x,
          uint8x8_t alpha)
{
    uint16x8_t tmp, tmp2;
    uint8x8_t res;

    tmp = vmull_u8 (x, alpha);
    tmp2 = vrshrq_n_u16 (tmp, 8);
    res = vraddhn_u16 (tmp, tmp2);

    return res;
}
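/* This is the usual rounded byte multiply, approximately x * alpha / 255:
 * with t = x * alpha it computes (t + 128 + ((t + 128) >> 8)) >> 8, so
 * e.g. 0xFF * 0xFF yields 0xFF where a plain t >> 8 would give 0xFE.
 */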
static force_inline uint8x8x4_t
neon8mul (uint8x8x4_t x,
          uint8x8_t   alpha)
{
    uint16x8x4_t tmp;
    uint8x8x4_t  res;
    uint16x8_t   qtmp1, qtmp2;

    tmp.val[0] = vmull_u8 (x.val[0], alpha);
    tmp.val[1] = vmull_u8 (x.val[1], alpha);
    tmp.val[2] = vmull_u8 (x.val[2], alpha);
    tmp.val[3] = vmull_u8 (x.val[3], alpha);

    qtmp1 = vrshrq_n_u16 (tmp.val[0], 8);
    qtmp2 = vrshrq_n_u16 (tmp.val[1], 8);
    res.val[0] = vraddhn_u16 (tmp.val[0], qtmp1);
    qtmp1 = vrshrq_n_u16 (tmp.val[2], 8);
    res.val[1] = vraddhn_u16 (tmp.val[1], qtmp2);
    qtmp2 = vrshrq_n_u16 (tmp.val[3], 8);
    res.val[2] = vraddhn_u16 (tmp.val[2], qtmp1);
    res.val[3] = vraddhn_u16 (tmp.val[3], qtmp2);

    return res;
}

static force_inline uint8x8x4_t
neon8qadd (uint8x8x4_t x,
           uint8x8x4_t y)
{
    uint8x8x4_t res;

    res.val[0] = vqadd_u8 (x.val[0], y.val[0]);
    res.val[1] = vqadd_u8 (x.val[1], y.val[1]);
    res.val[2] = vqadd_u8 (x.val[2], y.val[2]);
    res.val[3] = vqadd_u8 (x.val[3], y.val[3]);

    return res;
}
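/* Note: because neon8mul rounds, src + dst * (255 - sa) can come out one
 * above 255 for near-opaque pixels; the saturating vqadd clamps such sums
 * at 255 instead of letting them wrap.
 */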
static void
neon_composite_add_8000_8000 (pixman_implementation_t * impl,
                              pixman_op_t               op,
                              pixman_image_t *          src_image,
                              pixman_image_t *          mask_image,
                              pixman_image_t *          dst_image,
                              int32_t                   src_x,
                              int32_t                   src_y,
                              int32_t                   mask_x,
                              int32_t                   mask_y,
                              int32_t                   dest_x,
                              int32_t                   dest_y,
                              int32_t                   width,
                              int32_t                   height)
{
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int dst_stride, src_stride;
    uint16_t w;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    if (width >= 8)
    {
        /* Use overlapping 8-pixel method */
        while (height--)
        {
            uint8_t *keep_dst = 0;
            uint8x8_t sval, dval, temp;

            dst = dst_line;
            dst_line += dst_stride;
            src = src_line;
            src_line += src_stride;
            w = width;

#ifndef USE_GCC_INLINE_ASM
            sval = vld1_u8 ((void*)src);
            dval = vld1_u8 ((void*)dst);
            keep_dst = dst;

            temp = vqadd_u8 (dval, sval);

            src += (w & 7);
            dst += (w & 7);
            w -= (w & 7);

            while (w)
            {
                sval = vld1_u8 ((void*)src);
                dval = vld1_u8 ((void*)dst);

                vst1_u8 ((void*)keep_dst, temp);
                keep_dst = dst;

                temp = vqadd_u8 (dval, sval);

                src += 8;
                dst += 8;
                w -= 8;
            }

            vst1_u8 ((void*)keep_dst, temp);
#else
            asm volatile (
                /* avoid using d8-d15 (q4-q7) AAPCS callee-save registers */
                "vld1.8 {d0}, [%[src]]\n\t"
                "vld1.8 {d4}, [%[dst]]\n\t"
                "mov %[keep_dst], %[dst]\n\t"

                "and ip, %[w], #7\n\t"
                "add %[src], %[src], ip\n\t"
                "add %[dst], %[dst], ip\n\t"
                "subs %[w], %[w], ip\n\t"
                "b 9f\n\t"
                /* LOOP */
                "2:\n\t"
                "vld1.8 {d0}, [%[src]]!\n\t"
                "vld1.8 {d4}, [%[dst]]!\n\t"
                "vst1.8 {d20}, [%[keep_dst]]\n\t"
                "sub %[keep_dst], %[dst], #8\n\t"
                "subs %[w], %[w], #8\n\t"
                "9:\n\t"
                "vqadd.u8 d20, d0, d4\n\t"

                "bne 2b\n\t"

                "1:\n\t"
                "vst1.8 {d20}, [%[keep_dst]]\n\t"

                : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
                :
                : "ip", "cc", "memory", "d0", "d4",
                  "d20"
                );
#endif
        }
    }
    else
    {
        const uint8_t nil = 0;
        const uint8x8_t vnil = vld1_dup_u8 (&nil);

        while (height--)
        {
            uint8x8_t sval = vnil, dval = vnil;
            uint8_t *dst4 = 0, *dst2 = 0;

            dst = dst_line;
            dst_line += dst_stride;
            src = src_line;
            src_line += src_stride;
            w = width;

            if (w & 4)
            {
                sval = vreinterpret_u8_u32 (
                    vld1_lane_u32 ((void*)src, vreinterpret_u32_u8 (sval), 1));
                dval = vreinterpret_u8_u32 (
                    vld1_lane_u32 ((void*)dst, vreinterpret_u32_u8 (dval), 1));

                dst4 = dst;
                src += 4;
                dst += 4;
            }

            if (w & 2)
            {
                sval = vreinterpret_u8_u16 (
                    vld1_lane_u16 ((void*)src, vreinterpret_u16_u8 (sval), 1));
                dval = vreinterpret_u8_u16 (
                    vld1_lane_u16 ((void*)dst, vreinterpret_u16_u8 (dval), 1));

                dst2 = dst;
                src += 2;
                dst += 2;
            }

            if (w & 1)
            {
                sval = vld1_lane_u8 (src, sval, 1);
                dval = vld1_lane_u8 (dst, dval, 1);
            }

            dval = vqadd_u8 (dval, sval);

            if (w & 1)
                vst1_lane_u8 (dst, dval, 1);

            if (w & 2)
                vst1_lane_u16 ((void*)dst2, vreinterpret_u16_u8 (dval), 1);

            if (w & 4)
                vst1_lane_u32 ((void*)dst4, vreinterpret_u32_u8 (dval), 1);
        }
    }
}
static void
neon_composite_over_8888_8888 (pixman_implementation_t * impl,
                               pixman_op_t               op,
                               pixman_image_t *          src_image,
                               pixman_image_t *          mask_image,
                               pixman_image_t *          dst_image,
                               int32_t                   src_x,
                               int32_t                   src_y,
                               int32_t                   mask_x,
                               int32_t                   mask_y,
                               int32_t                   dest_x,
                               int32_t                   dest_y,
                               int32_t                   width,
                               int32_t                   height)
{
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    uint32_t w;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    if (width >= 8)
    {
        /* Use overlapping 8-pixel method */
        while (height--)
        {
            uint32_t *keep_dst = 0;
            uint8x8x4_t sval, dval, temp;

            dst = dst_line;
            dst_line += dst_stride;
            src = src_line;
            src_line += src_stride;
            w = width;

#ifndef USE_GCC_INLINE_ASM
            sval = vld4_u8 ((void*)src);
            dval = vld4_u8 ((void*)dst);
            keep_dst = dst;

            temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
            temp = neon8qadd (sval, temp);

            src += (w & 7);
            dst += (w & 7);
            w -= (w & 7);

            while (w)
            {
                sval = vld4_u8 ((void*)src);
                dval = vld4_u8 ((void*)dst);

                vst4_u8 ((void*)keep_dst, temp);
                keep_dst = dst;

                temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
                temp = neon8qadd (sval, temp);

                src += 8;
                dst += 8;
                w -= 8;
            }

            vst4_u8 ((void*)keep_dst, temp);
#else
            asm volatile (
                /* avoid using d8-d15 (q4-q7) AAPCS callee-save registers */
                "vld4.8 {d0-d3}, [%[src]]\n\t"
                "vld4.8 {d4-d7}, [%[dst]]\n\t"
                "mov %[keep_dst], %[dst]\n\t"

                "and ip, %[w], #7\n\t"
                "add %[src], %[src], ip, LSL#2\n\t"
                "add %[dst], %[dst], ip, LSL#2\n\t"
                "subs %[w], %[w], ip\n\t"
                "b 9f\n\t"
                /* LOOP */
                "2:\n\t"
                "vld4.8 {d0-d3}, [%[src]]!\n\t"
                "vld4.8 {d4-d7}, [%[dst]]!\n\t"
                "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
                "sub %[keep_dst], %[dst], #8*4\n\t"
                "subs %[w], %[w], #8\n\t"
                "9:\n\t"
                "vmvn.8 d31, d3\n\t"
                "vmull.u8 q10, d31, d4\n\t"
                "vmull.u8 q11, d31, d5\n\t"
                "vmull.u8 q12, d31, d6\n\t"
                "vmull.u8 q13, d31, d7\n\t"
                "vrshr.u16 q8, q10, #8\n\t"
                "vrshr.u16 q9, q11, #8\n\t"
                "vraddhn.u16 d20, q10, q8\n\t"
                "vraddhn.u16 d21, q11, q9\n\t"
                "vrshr.u16 q8, q12, #8\n\t"
                "vrshr.u16 q9, q13, #8\n\t"
                "vraddhn.u16 d22, q12, q8\n\t"
                "vraddhn.u16 d23, q13, q9\n\t"
                /* result in d20-d23 */
                "vqadd.u8 d20, d0, d20\n\t"
                "vqadd.u8 d21, d1, d21\n\t"
                "vqadd.u8 d22, d2, d22\n\t"
                "vqadd.u8 d23, d3, d23\n\t"

                "bne 2b\n\t"

                "1:\n\t"
                "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"

                : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
                :
                : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
                  "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
                  "d24", "d25", "d26", "d27", "d31"
                );
#endif
        }
    }
    else
    {
        uint8x8_t alpha_selector = vreinterpret_u8_u64 (
            vcreate_u64 (0x0707070703030303ULL));
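        /* alpha_selector picks byte 3 (the alpha of the low pixel) into
         * lanes 0-3 and byte 7 (the alpha of the high pixel) into lanes
         * 4-7, so a single vtbl1 splats each pixel's inverted alpha
         * across its own B/G/R/A channels before the multiply.
         */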
        /* Handle width < 8 */
        while (height--)
        {
            dst = dst_line;
            dst_line += dst_stride;
            src = src_line;
            src_line += src_stride;
            w = width;

            while (w >= 2)
            {
                uint8x8_t sval, dval;

                /* two 32-bit pixels packed into one D-register; ad-hoc vectorization */
                sval = vreinterpret_u8_u32 (vld1_u32 ((void*)src));
                dval = vreinterpret_u8_u32 (vld1_u32 ((void*)dst));
                dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
                vst1_u8 ((void*)dst, vqadd_u8 (sval, dval));

                src += 2;
                dst += 2;
                w -= 2;
            }

            if (w)
            {
                uint8x8_t sval, dval;

                /* single 32-bit pixel in lane 0 */
                sval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)src));  /* only interested in lane 0 */
                dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)dst));  /* only interested in lane 0 */
                dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
                vst1_lane_u32 ((void*)dst, vreinterpret_u32_u8 (vqadd_u8 (sval, dval)), 0);
            }
        }
    }
}
static void
neon_composite_over_8888_n_8888 (pixman_implementation_t * impl,
                                 pixman_op_t               op,
                                 pixman_image_t *          src_image,
                                 pixman_image_t *          mask_image,
                                 pixman_image_t *          dst_image,
                                 int32_t                   src_x,
                                 int32_t                   src_y,
                                 int32_t                   mask_x,
                                 int32_t                   mask_y,
                                 int32_t                   dest_x,
                                 int32_t                   dest_y,
                                 int32_t                   width,
                                 int32_t                   height)
{
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t mask;
    int dst_stride, src_stride;
    uint32_t w;
    uint8x8_t mask_alpha;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
    mask_alpha = vdup_n_u8 ((mask) >> 24);

    if (width >= 8)
    {
        /* Use overlapping 8-pixel method */
        while (height--)
        {
            dst = dst_line;
            dst_line += dst_stride;
            src = src_line;
            src_line += src_stride;
            w = width;

            uint32_t *keep_dst = 0;

#ifndef USE_GCC_INLINE_ASM
            uint8x8x4_t sval, dval, temp;

            sval = vld4_u8 ((void*)src);
            dval = vld4_u8 ((void*)dst);
            keep_dst = dst;

            sval = neon8mul (sval, mask_alpha);
            temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
            temp = neon8qadd (sval, temp);

            src += (w & 7);
            dst += (w & 7);
            w -= (w & 7);

            while (w)
            {
                sval = vld4_u8 ((void*)src);
                dval = vld4_u8 ((void*)dst);

                vst4_u8 ((void*)keep_dst, temp);
                keep_dst = dst;

                sval = neon8mul (sval, mask_alpha);
                temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
                temp = neon8qadd (sval, temp);

                src += 8;
                dst += 8;
                w -= 8;
            }

            vst4_u8 ((void*)keep_dst, temp);
#else
            asm volatile (
                /* avoid using d8-d15 (q4-q7) AAPCS callee-save registers */
                "vdup.32 d30, %[mask]\n\t"
                "vdup.8 d30, d30[3]\n\t"

                "vld4.8 {d0-d3}, [%[src]]\n\t"
                "vld4.8 {d4-d7}, [%[dst]]\n\t"
                "mov %[keep_dst], %[dst]\n\t"

                "and ip, %[w], #7\n\t"
                "add %[src], %[src], ip, LSL#2\n\t"
                "add %[dst], %[dst], ip, LSL#2\n\t"
                "subs %[w], %[w], ip\n\t"
                "b 9f\n\t"
                /* LOOP */
                "2:\n\t"
                "vld4.8 {d0-d3}, [%[src]]!\n\t"
                "vld4.8 {d4-d7}, [%[dst]]!\n\t"
                "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
                "sub %[keep_dst], %[dst], #8*4\n\t"
                "subs %[w], %[w], #8\n\t"

                "9:\n\t"
                "vmull.u8 q10, d30, d0\n\t"
                "vmull.u8 q11, d30, d1\n\t"
                "vmull.u8 q12, d30, d2\n\t"
                "vmull.u8 q13, d30, d3\n\t"
                "vrshr.u16 q8, q10, #8\n\t"
                "vrshr.u16 q9, q11, #8\n\t"
                "vraddhn.u16 d0, q10, q8\n\t"
                "vraddhn.u16 d1, q11, q9\n\t"
                "vrshr.u16 q9, q13, #8\n\t"
                "vrshr.u16 q8, q12, #8\n\t"
                "vraddhn.u16 d3, q13, q9\n\t"
                "vraddhn.u16 d2, q12, q8\n\t"

                "vmvn.8 d31, d3\n\t"
                "vmull.u8 q10, d31, d4\n\t"
                "vmull.u8 q11, d31, d5\n\t"
                "vmull.u8 q12, d31, d6\n\t"
                "vmull.u8 q13, d31, d7\n\t"
                "vrshr.u16 q8, q10, #8\n\t"
                "vrshr.u16 q9, q11, #8\n\t"
                "vraddhn.u16 d20, q10, q8\n\t"
                "vrshr.u16 q8, q12, #8\n\t"
                "vraddhn.u16 d21, q11, q9\n\t"
                "vrshr.u16 q9, q13, #8\n\t"
                "vraddhn.u16 d22, q12, q8\n\t"
                "vraddhn.u16 d23, q13, q9\n\t"

                /* result in d20-d23 */
                "vqadd.u8 d20, d0, d20\n\t"
                "vqadd.u8 d21, d1, d21\n\t"
                "vqadd.u8 d22, d2, d22\n\t"
                "vqadd.u8 d23, d3, d23\n\t"

                "bne 2b\n\t"

                "1:\n\t"
                "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"

                : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
                : [mask] "r" (mask)
                : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
                  "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
                  "d30", "d31"
                );
#endif
        }
    }
    else
    {
        uint8x8_t alpha_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0707070703030303ULL));

        /* Handle width < 8 */
        while (height--)
        {
            dst = dst_line;
            dst_line += dst_stride;
            src = src_line;
            src_line += src_stride;
            w = width;

            while (w >= 2)
            {
                uint8x8_t sval, dval;

                sval = vreinterpret_u8_u32 (vld1_u32 ((void*)src));
                dval = vreinterpret_u8_u32 (vld1_u32 ((void*)dst));

                /* sval * const alpha_mul */
                sval = neon2mul (sval, mask_alpha);

                /* dval * (255 - src alpha) */
                dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));

                vst1_u8 ((void*)dst, vqadd_u8 (sval, dval));

                src += 2;
                dst += 2;
                w -= 2;
            }

            if (w)
            {
                uint8x8_t sval, dval;

                sval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)src));
                dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)dst));

                /* sval * const alpha_mul */
                sval = neon2mul (sval, mask_alpha);

                /* dval * (255 - src alpha) */
                dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));

                vst1_lane_u32 ((void*)dst, vreinterpret_u32_u8 (vqadd_u8 (sval, dval)), 0);
            }
        }
    }
}
static void
neon_composite_over_n_8_8888 (pixman_implementation_t * impl,
                              pixman_op_t               op,
                              pixman_image_t *          src_image,
                              pixman_image_t *          mask_image,
                              pixman_image_t *          dst_image,
                              int32_t                   src_x,
                              int32_t                   src_y,
                              int32_t                   mask_x,
                              int32_t                   mask_y,
                              int32_t                   dest_x,
                              int32_t                   dest_y,
                              int32_t                   width,
                              int32_t                   height)
{
    uint32_t src;
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    uint32_t w;
    uint8x8_t sval2;
    uint8x8x4_t sval8;
    uint8x8_t mask_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0101010100000000ULL));
    uint8x8_t alpha_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0707070703030303ULL));
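    /* mask_selector splats mask byte 0 across lanes 0-3 and mask byte 1
     * across lanes 4-7, pairing one 8-bit mask value with each of two
     * pixels; alpha_selector then splats each intermediate pixel's own
     * alpha (bytes 3 and 7) across its channels, as above.
     */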
    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    /* bail out if fully transparent */
    if (src == 0)
        return;

    sval2 = vreinterpret_u8_u32 (vdup_n_u32 (src));
    sval8.val[0] = vdup_lane_u8 (sval2, 0);
    sval8.val[1] = vdup_lane_u8 (sval2, 1);
    sval8.val[2] = vdup_lane_u8 (sval2, 2);
    sval8.val[3] = vdup_lane_u8 (sval2, 3);

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    if (width >= 8)
    {
        /* Use overlapping 8-pixel method, modified to avoid
         * rewritten dest being reused
         */
        while (height--)
        {
            uint32_t *keep_dst = 0;

            dst = dst_line;
            dst_line += dst_stride;
            mask = mask_line;
            mask_line += mask_stride;
            w = width;

#ifndef USE_GCC_INLINE_ASM
            uint8x8_t alpha;
            uint8x8x4_t dval, temp;

            alpha = vld1_u8 ((void*)mask);
            dval = vld4_u8 ((void*)dst);
            keep_dst = dst;

            temp = neon8mul (sval8, alpha);
            dval = neon8mul (dval, vmvn_u8 (temp.val[3]));
            temp = neon8qadd (temp, dval);

            mask += (w & 7);
            dst += (w & 7);
            w -= (w & 7);

            while (w)
            {
                alpha = vld1_u8 ((void*)mask);
                dval = vld4_u8 ((void*)dst);

                vst4_u8 ((void*)keep_dst, temp);
                keep_dst = dst;

                temp = neon8mul (sval8, alpha);
                dval = neon8mul (dval, vmvn_u8 (temp.val[3]));
                temp = neon8qadd (temp, dval);

                mask += 8;
                dst += 8;
                w -= 8;
            }

            vst4_u8 ((void*)keep_dst, temp);
#else
            asm volatile (
                /* avoid using d8-d15 (q4-q7) AAPCS callee-save registers */
                "vdup.32 d0, %[src]\n\t"
                "vdup.8 d1, d0[1]\n\t"
                "vdup.8 d2, d0[2]\n\t"
                "vdup.8 d3, d0[3]\n\t"
                "vdup.8 d0, d0[0]\n\t"

                "vld4.8 {d4-d7}, [%[dst]]\n\t"
                "vld1.8 {d31}, [%[mask]]\n\t"
                "mov %[keep_dst], %[dst]\n\t"

                "and ip, %[w], #7\n\t"
                "add %[mask], %[mask], ip\n\t"
                "add %[dst], %[dst], ip, LSL#2\n\t"
                "subs %[w], %[w], ip\n\t"
                "b 9f\n\t"
                /* LOOP */
                "2:\n\t"
                "vld4.8 {d4-d7}, [%[dst]]!\n\t"
                "vld1.8 {d31}, [%[mask]]!\n\t"
                "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
                "sub %[keep_dst], %[dst], #8*4\n\t"
                "subs %[w], %[w], #8\n\t"
                "9:\n\t"

                "vmull.u8 q10, d31, d0\n\t"
                "vmull.u8 q11, d31, d1\n\t"
                "vmull.u8 q12, d31, d2\n\t"
                "vmull.u8 q13, d31, d3\n\t"
                "vrshr.u16 q8, q10, #8\n\t"
                "vrshr.u16 q9, q11, #8\n\t"
                "vraddhn.u16 d20, q10, q8\n\t"
                "vraddhn.u16 d21, q11, q9\n\t"
                "vrshr.u16 q9, q13, #8\n\t"
                "vrshr.u16 q8, q12, #8\n\t"
                "vraddhn.u16 d23, q13, q9\n\t"
                "vraddhn.u16 d22, q12, q8\n\t"

                "vmvn.8 d30, d23\n\t"
                "vmull.u8 q12, d30, d4\n\t"
                "vmull.u8 q13, d30, d5\n\t"
                "vmull.u8 q14, d30, d6\n\t"
                "vmull.u8 q15, d30, d7\n\t"

                "vrshr.u16 q8, q12, #8\n\t"
                "vrshr.u16 q9, q13, #8\n\t"
                "vraddhn.u16 d4, q12, q8\n\t"
                "vrshr.u16 q8, q14, #8\n\t"
                "vraddhn.u16 d5, q13, q9\n\t"
                "vrshr.u16 q9, q15, #8\n\t"
                "vraddhn.u16 d6, q14, q8\n\t"
                "vraddhn.u16 d7, q15, q9\n\t"
                /* result in d4-d7 */

                "vqadd.u8 d20, d4, d20\n\t"
                "vqadd.u8 d21, d5, d21\n\t"
                "vqadd.u8 d22, d6, d22\n\t"
                "vqadd.u8 d23, d7, d23\n\t"

                "bne 2b\n\t"

                "1:\n\t"
                "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"

                : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst)
                : [src] "r" (src)
                : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
                  "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
                  "d30", "d31"
                );
#endif
        }
    }
    else
    {
        while (height--)
        {
            uint8x8_t alpha;

            dst = dst_line;
            dst_line += dst_stride;
            mask = mask_line;
            mask_line += mask_stride;
            w = width;

            while (w >= 2)
            {
                uint8x8_t dval, temp, res;

                alpha = vtbl1_u8 (
                    vreinterpret_u8_u16 (vld1_dup_u16 ((void*)mask)), mask_selector);
                dval = vld1_u8 ((void*)dst);

                temp = neon2mul (sval2, alpha);
                res = vqadd_u8 (
                    temp, neon2mul (dval, vtbl1_u8 (vmvn_u8 (temp), alpha_selector)));

                vst1_u8 ((void*)dst, res);

                mask += 2;
                dst += 2;
                w -= 2;
            }

            if (w)
            {
                uint8x8_t dval, temp, res;

                alpha = vtbl1_u8 (vld1_dup_u8 ((void*)mask), mask_selector);
                dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)dst));

                temp = neon2mul (sval2, alpha);
                res = vqadd_u8 (
                    temp, neon2mul (dval, vtbl1_u8 (vmvn_u8 (temp), alpha_selector)));

                vst1_lane_u32 ((void*)dst, vreinterpret_u32_u8 (res), 0);
            }
        }
    }
}
static void
neon_composite_add_8888_8_8 (pixman_implementation_t * impl,
                             pixman_op_t               op,
                             pixman_image_t *          src_image,
                             pixman_image_t *          mask_image,
                             pixman_image_t *          dst_image,
                             int32_t                   src_x,
                             int32_t                   src_y,
                             int32_t                   mask_x,
                             int32_t                   mask_y,
                             int32_t                   dest_x,
                             int32_t                   dest_y,
                             int32_t                   width,
                             int32_t                   height)
{
    uint8_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    uint32_t w;
    uint32_t src;
    uint8x8_t sa;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
    sa = vdup_n_u8 ((src) >> 24);

    if (width >= 8)
    {
        /* Use overlapping 8-pixel method, modified to avoid rewritten dest being reused */
        while (height--)
        {
            dst = dst_line;
            dst_line += dst_stride;
            mask = mask_line;
            mask_line += mask_stride;
            w = width;

            uint8x8_t mval, dval, res;
            uint8_t *keep_dst;

            mval = vld1_u8 ((void *)mask);
            dval = vld1_u8 ((void *)dst);
            keep_dst = dst;

            res = vqadd_u8 (neon2mul (mval, sa), dval);

            mask += (w & 7);
            dst += (w & 7);
            w -= (w & 7);

            while (w)
            {
                mval = vld1_u8 ((void *)mask);
                dval = vld1_u8 ((void *)dst);
                vst1_u8 ((void *)keep_dst, res);
                keep_dst = dst;

                res = vqadd_u8 (neon2mul (mval, sa), dval);

                mask += 8;
                dst += 8;
                w -= 8;
            }

            vst1_u8 ((void *)keep_dst, res);
        }
    }
    else
    {
        /* Use 4/2/1 load/store method to handle 1-7 pixels */
        while (height--)
        {
            dst = dst_line;
            dst_line += dst_stride;
            mask = mask_line;
            mask_line += mask_stride;
            w = width;

            uint8x8_t mval = sa, dval = sa, res;
            uint8_t *dst4 = 0, *dst2 = 0;

            if (w & 4)
            {
                mval = vreinterpret_u8_u32 (
                    vld1_lane_u32 ((void *)mask, vreinterpret_u32_u8 (mval), 1));
                dval = vreinterpret_u8_u32 (
                    vld1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (dval), 1));

                dst4 = dst;
                mask += 4;
                dst += 4;
            }

            if (w & 2)
            {
                mval = vreinterpret_u8_u16 (
                    vld1_lane_u16 ((void *)mask, vreinterpret_u16_u8 (mval), 1));
                dval = vreinterpret_u8_u16 (
                    vld1_lane_u16 ((void *)dst, vreinterpret_u16_u8 (dval), 1));

                dst2 = dst;
                mask += 2;
                dst += 2;
            }

            if (w & 1)
            {
                mval = vld1_lane_u8 (mask, mval, 1);
                dval = vld1_lane_u8 (dst, dval, 1);
            }

            res = vqadd_u8 (neon2mul (mval, sa), dval);

            if (w & 1)
                vst1_lane_u8 (dst, res, 1);
            if (w & 2)
                vst1_lane_u16 ((void *)dst2, vreinterpret_u16_u8 (res), 1);
            if (w & 4)
                vst1_lane_u32 ((void *)dst4, vreinterpret_u32_u8 (res), 1);
        }
    }
}
#ifdef USE_GCC_INLINE_ASM

static void
neon_composite_src_16_16 (pixman_implementation_t * impl,
                          pixman_op_t               op,
                          pixman_image_t *          src_image,
                          pixman_image_t *          mask_image,
                          pixman_image_t *          dst_image,
                          int32_t                   src_x,
                          int32_t                   src_y,
                          int32_t                   mask_x,
                          int32_t                   mask_y,
                          int32_t                   dest_x,
                          int32_t                   dest_y,
                          int32_t                   width,
                          int32_t                   height)
{
    uint16_t *dst_line, *src_line;
    uint32_t dst_stride, src_stride;

    if (!height || !width)
        return;

    /* We simply copy 16-bit-aligned pixels from one place to another. */
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    /* Preload the first input scanline */
    {
        uint16_t *src_ptr = src_line;
        uint32_t count = width;

        asm volatile (
            "0: @ loop						\n"
            " subs %[count], %[count], #32			\n"
            " pld [%[src]]					\n"
            " add %[src], %[src], #64				\n"
            " bgt 0b						\n"

            /* Clobbered input registers marked as input/outputs */
            : [src] "+r" (src_ptr), [count] "+r" (count)
            : /* no unclobbered inputs */
            : "cc"
            );
    }

    while (height--)
    {
        uint16_t *dst_ptr = dst_line;
        uint16_t *src_ptr = src_line;
        uint32_t count = width;
        uint32_t tmp = 0;

        /* Uses multi-register access and preloading to maximise bandwidth.
         * Each pixel is one halfword, so a quadword contains 8 px.
         * The preload frequency assumes a 64-byte cacheline.
         */
        asm volatile (
            " cmp %[count], #64					\n"
            " blt 1f @ skip oversized fragments			\n"
            "0: @ start with eight quadwords at a time		\n"
            /* preload from next scanline */
            " pld [%[src], %[src_stride], LSL #1]		\n"
            " sub %[count], %[count], #64			\n"
            " vld1.16 {d16, d17, d18, d19}, [%[src]]!		\n"
            " vld1.16 {d20, d21, d22, d23}, [%[src]]!		\n"
            /* preload from next scanline */
            " pld [%[src], %[src_stride], LSL #1]		\n"
            " vld1.16 {d24, d25, d26, d27}, [%[src]]!		\n"
            " vld1.16 {d28, d29, d30, d31}, [%[src]]!		\n"
            " cmp %[count], #64					\n"
            " vst1.16 {d16, d17, d18, d19}, [%[dst]]!		\n"
            " vst1.16 {d20, d21, d22, d23}, [%[dst]]!		\n"
            " vst1.16 {d24, d25, d26, d27}, [%[dst]]!		\n"
            " vst1.16 {d28, d29, d30, d31}, [%[dst]]!		\n"
            " bge 0b						\n"
            " cmp %[count], #0					\n"
            " beq 7f @ aligned fastpath				\n"
            "1: @ four quadwords				\n"
            " tst %[count], #32					\n"
            " beq 2f @ skip oversized fragment			\n"
            /* preload from next scanline */
            " pld [%[src], %[src_stride], LSL #1]		\n"
            " vld1.16 {d16, d17, d18, d19}, [%[src]]!		\n"
            " vld1.16 {d20, d21, d22, d23}, [%[src]]!		\n"
            " vst1.16 {d16, d17, d18, d19}, [%[dst]]!		\n"
            " vst1.16 {d20, d21, d22, d23}, [%[dst]]!		\n"
            "2: @ two quadwords					\n"
            " tst %[count], #16					\n"
            " beq 3f @ skip oversized fragment			\n"
            /* preload from next scanline */
            " pld [%[src], %[src_stride], LSL #1]		\n"
            " vld1.16 {d16, d17, d18, d19}, [%[src]]!		\n"
            " vst1.16 {d16, d17, d18, d19}, [%[dst]]!		\n"
            "3: @ one quadword					\n"
            " tst %[count], #8					\n"
            " beq 4f @ skip oversized fragment			\n"
            " vld1.16 {d16, d17}, [%[src]]!			\n"
            " vst1.16 {d16, d17}, [%[dst]]!			\n"
            "4: @ one doubleword				\n"
            " tst %[count], #4					\n"
            " beq 5f @ skip oversized fragment			\n"
            " vld1.16 {d16}, [%[src]]!				\n"
            " vst1.16 {d16}, [%[dst]]!				\n"
            "5: @ one word					\n"
            " tst %[count], #2					\n"
            " beq 6f @ skip oversized fragment			\n"
            " ldr %[tmp], [%[src]], #4				\n"
            " str %[tmp], [%[dst]], #4				\n"
            "6: @ one halfword					\n"
            " tst %[count], #1					\n"
            " beq 7f @ skip oversized fragment			\n"
            " ldrh %[tmp], [%[src]]				\n"
            " strh %[tmp], [%[dst]]				\n"
            "7: @ end						\n"

            /* Clobbered input registers marked as input/outputs */
            : [dst] "+r" (dst_ptr), [src] "+r" (src_ptr),
              [count] "+r" (count), [tmp] "+r" (tmp)

            /* Unclobbered input */
            : [src_stride] "r" (src_stride)

            /* Clobbered vector registers */
            /* NB: these are the quad aliases of the double
             * registers used in the asm
             */
            : "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory"
            );

        src_line += src_stride;
        dst_line += dst_stride;
    }
}

#endif /* USE_GCC_INLINE_ASM */
static void
neon_composite_src_24_16 (pixman_implementation_t * impl,
                          pixman_op_t               op,
                          pixman_image_t *          src_image,
                          pixman_image_t *          mask_image,
                          pixman_image_t *          dst_image,
                          int32_t                   src_x,
                          int32_t                   src_y,
                          int32_t                   mask_x,
                          int32_t                   mask_y,
                          int32_t                   dest_x,
                          int32_t                   dest_y,
                          int32_t                   width,
                          int32_t                   height)
{
    uint16_t *dst_line;
    uint32_t *src_line;
    uint32_t dst_stride, src_stride;

    if (!width || !height)
        return;

    /* We simply copy pixels from one place to another,
     * assuming that the source's alpha is opaque.
     */
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    /* Preload the first input scanline */
    {
        uint8_t *src_ptr = (uint8_t*) src_line;
        uint32_t count = (width + 15) / 16;

#ifdef USE_GCC_INLINE_ASM
        asm volatile (
            "0: @ loop						\n"
            " subs %[count], %[count], #1			\n"
            " pld [%[src]]					\n"
            " add %[src], %[src], #64				\n"
            " bgt 0b						\n"

            /* Clobbered input registers marked as input/outputs */
            : [src] "+r" (src_ptr), [count] "+r" (count)
            : /* no unclobbered inputs */
            : "cc"
            );
#else
        do
        {
            __pld (src_ptr);
            src_ptr += 64;
        }
        while (--count);
#endif
    }

    while (height--)
    {
        uint16_t *dst_ptr = dst_line;
        uint32_t *src_ptr = src_line;
        uint32_t count = width;
        const uint32_t rb_mask = 0x1F;
        const uint32_t g_mask = 0x3F;

        /* If you're going to complain about a goto, take a long hard look
         * at the massive blocks of assembler this skips over. ;-)
         */
        if (count < 8)
            goto small_stuff;
#ifdef USE_GCC_INLINE_ASM

        /* This is not as aggressive as the RGB565-source case.
         * Generally the source is in cached RAM when the formats are
         * different, so we use preload.
         *
         * We don't need to blend, so we are not reading from the
         * uncached framebuffer.
         */
        asm volatile (
            " cmp %[count], #16							\n"
            " blt 1f @ skip oversized fragments					\n"
            "0: @ start with sixteen pixels at a time				\n"
            " sub %[count], %[count], #16					\n"
            " pld [%[src], %[src_stride], lsl #2] @ preload from next scanline	\n"
            " vld4.8 {d0, d1, d2, d3}, [%[src]]! @ d3 is alpha and ignored, d2-0 are rgb.	\n"
            " vld4.8 {d4, d5, d6, d7}, [%[src]]! @ d7 is alpha and ignored, d6-4 are rgb.	\n"
            " vshll.u8 q8, d2, #8 @ expand first red for repacking		\n"
            " vshll.u8 q10, d1, #8 @ expand first green for repacking		\n"
            " vshll.u8 q11, d0, #8 @ expand first blue for repacking		\n"
            " vshll.u8 q9, d6, #8 @ expand second red for repacking		\n"
            " vsri.u16 q8, q10, #5 @ insert first green after red		\n"
            " vshll.u8 q10, d5, #8 @ expand second green for repacking		\n"
            " vsri.u16 q8, q11, #11 @ insert first blue after green		\n"
            " vshll.u8 q11, d4, #8 @ expand second blue for repacking		\n"
            " vsri.u16 q9, q10, #5 @ insert second green after red		\n"
            " vsri.u16 q9, q11, #11 @ insert second blue after green		\n"
            " cmp %[count], #16							\n"
            " vst1.16 {d16, d17, d18, d19}, [%[dst]]! @ store 16 pixels		\n"
            " bge 0b								\n"
            "1: @ end of main loop						\n"
            " cmp %[count], #8 @ can we still do an 8-pixel block?		\n"
            " blt 2f								\n"
            " sub %[count], %[count], #8					\n"
            " pld [%[src], %[src_stride], lsl #2] @ preload from next scanline	\n"
            " vld4.8 {d0, d1, d2, d3}, [%[src]]! @ d3 is alpha and ignored, d2-0 are rgb.	\n"
            " vshll.u8 q8, d2, #8 @ expand first red for repacking		\n"
            " vshll.u8 q10, d1, #8 @ expand first green for repacking		\n"
            " vshll.u8 q11, d0, #8 @ expand first blue for repacking		\n"
            " vsri.u16 q8, q10, #5 @ insert first green after red		\n"
            " vsri.u16 q8, q11, #11 @ insert first blue after green		\n"
            " vst1.16 {d16, d17}, [%[dst]]! @ store 8 pixels			\n"
            "2: @ end								\n"

            /* Clobbered input and working registers marked as input/outputs */
            : [dst] "+r" (dst_ptr), [src] "+r" (src_ptr), [count] "+r" (count)

            /* Unclobbered input */
            : [src_stride] "r" (src_stride)

            /* Clobbered vector registers */
            /* NB: these are the quad aliases of the
             * double registers used in the asm
             */
            : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "cc", "memory"
            );
#else
        /* A copy of the above code, in intrinsics form. */
        while (count >= 16)
        {
            uint8x8x4_t pixel_set_a, pixel_set_b;
            uint16x8_t red_a, green_a, blue_a;
            uint16x8_t red_b, green_b, blue_b;
            uint16x8_t dest_pixels_a, dest_pixels_b;

            count -= 16;
            __pld (src_ptr + src_stride);
            pixel_set_a = vld4_u8 ((uint8_t*)(src_ptr));
            pixel_set_b = vld4_u8 ((uint8_t*)(src_ptr + 8));
            src_ptr += 16;

            red_a   = vshll_n_u8 (pixel_set_a.val[2], 8);
            green_a = vshll_n_u8 (pixel_set_a.val[1], 8);
            blue_a  = vshll_n_u8 (pixel_set_a.val[0], 8);

            red_b   = vshll_n_u8 (pixel_set_b.val[2], 8);
            green_b = vshll_n_u8 (pixel_set_b.val[1], 8);
            blue_b  = vshll_n_u8 (pixel_set_b.val[0], 8);

            dest_pixels_a = vsriq_n_u16 (red_a, green_a, 5);
            dest_pixels_b = vsriq_n_u16 (red_b, green_b, 5);
            dest_pixels_a = vsriq_n_u16 (dest_pixels_a, blue_a, 11);
            dest_pixels_b = vsriq_n_u16 (dest_pixels_b, blue_b, 11);

            /* There doesn't seem to be an intrinsic for the
             * double-quadword variant
             */
            vst1q_u16 (dst_ptr, dest_pixels_a);
            vst1q_u16 (dst_ptr + 8, dest_pixels_b);
            dst_ptr += 16;
        }

        while (count >= 8)
        {
            uint8x8x4_t pixel_set_a;
            uint16x8_t red_a, green_a, blue_a;
            uint16x8_t dest_pixels_a;

            count -= 8;
            __pld (src_ptr + src_stride);
            pixel_set_a = vld4_u8 ((uint8_t*)(src_ptr));
            src_ptr += 8;

            red_a   = vshll_n_u8 (pixel_set_a.val[2], 8);
            green_a = vshll_n_u8 (pixel_set_a.val[1], 8);
            blue_a  = vshll_n_u8 (pixel_set_a.val[0], 8);

            dest_pixels_a = vsriq_n_u16 (red_a, green_a, 5);
            dest_pixels_a = vsriq_n_u16 (dest_pixels_a, blue_a, 11);

            vst1q_u16 (dst_ptr, dest_pixels_a);
            dst_ptr += 8;
        }

#endif /* USE_GCC_INLINE_ASM */
small_stuff:
        if (count)
            __pld (src_ptr + src_stride);

        while (count >= 2)
        {
            uint32_t src_pixel_a = *src_ptr++;
            uint32_t src_pixel_b = *src_ptr++;

            /* ARM is really good at shift-then-ALU ops. */
            /* This should be a total of six shift-ANDs and five shift-ORs. */
            uint32_t dst_pixels_a;
            uint32_t dst_pixels_b;

            dst_pixels_a  = ((src_pixel_a >>  3) & rb_mask);
            dst_pixels_a |= ((src_pixel_a >> 10) &  g_mask) << 5;
            dst_pixels_a |= ((src_pixel_a >> 19) & rb_mask) << 11;

            dst_pixels_b  = ((src_pixel_b >>  3) & rb_mask);
            dst_pixels_b |= ((src_pixel_b >> 10) &  g_mask) << 5;
            dst_pixels_b |= ((src_pixel_b >> 19) & rb_mask) << 11;

            /* little-endian mode only */
            *((uint32_t*) dst_ptr) = dst_pixels_a | (dst_pixels_b << 16);
            dst_ptr += 2;
            count -= 2;
        }
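        /* Worked example: the 32-bit pixel 0x00FF8040 (r = 0xFF, g = 0x80,
         * b = 0x40) keeps the top 5/6/5 bits of each channel, giving
         * r = 0x1F, g = 0x20, b = 0x08, i.e. the r5g6b5 value 0xFC08.
         */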
        while (count)
        {
            uint32_t src_pixel = *src_ptr++;

            /* ARM is really good at shift-then-ALU ops.
             * This block should end up as three shift-ANDs
             * and two shift-ORs.
             */
            uint32_t tmp_blue  = (src_pixel >>  3) & rb_mask;
            uint32_t tmp_green = (src_pixel >> 10) &  g_mask;
            uint32_t tmp_red   = (src_pixel >> 19) & rb_mask;
            uint16_t dst_pixel = (tmp_red << 11) | (tmp_green << 5) | tmp_blue;

            *dst_ptr++ = dst_pixel;
            count--;
        }

        src_line += src_stride;
        dst_line += dst_stride;
    }
}

static pixman_bool_t
pixman_fill_neon (uint32_t *bits,
                  int       stride,
                  int       bpp,
                  int       x,
                  int       y,
                  int       width,
                  int       height,
                  uint32_t  _xor)
{
    uint32_t byte_stride, color;
    char *dst;

    /* stride is always a multiple of 32-bit units in pixman */
    byte_stride = stride * sizeof(uint32_t);

    switch (bpp)
    {
    case 8:
        dst = ((char *) bits) + y * byte_stride + x;
        _xor &= 0xff;
        color = _xor << 24 | _xor << 16 | _xor << 8 | _xor;
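        /* A fill byte of 0xAB thus becomes the word 0xABABABAB, so the
         * scanline fillers below can mix 8-, 16- and 32-bit stores of
         * the same replicated register.
         */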
        break;

    case 16:
        dst = ((char *) bits) + y * byte_stride + x * 2;
        _xor &= 0xffff;
        color = _xor << 16 | _xor;
        width *= 2;    /* width to bytes */
        break;

    case 32:
        dst = ((char *) bits) + y * byte_stride + x * 4;
        color = _xor;
        width *= 4;    /* width to bytes */
        break;

    default:
        return FALSE;
    }
#ifdef USE_GCC_INLINE_ASM

    if (width < 16)
    {
        /* We have a special case for such small widths that don't allow
         * us to use wide 128-bit stores anyway. We don't waste time
         * trying to align writes, since there are only very few of them anyway.
         */
        asm volatile (
            "cmp %[height], #0\n" /* Check if empty fill */
            "beq 3f\n"
            "vdup.32 d0, %[color]\n" /* Fill the color to a NEON register */

            /* Check if we have such a width that can easily be handled by a
             * single operation for each scanline. This significantly reduces
             * the number of test/branch instructions for each scanline.
             */
            "cmp %[width], #8\n"
            "beq 4f\n"
            "cmp %[width], #4\n"
            "beq 5f\n"
            "cmp %[width], #2\n"
            "beq 6f\n"

            /* Loop starts here for each scanline */
            "1:\n"
            "mov r4, %[dst]\n" /* Starting address of the current line */
            "tst %[width], #8\n"
            "beq 2f\n"
            "vst1.8 {d0}, [r4]!\n"
            "2:\n"
            "tst %[width], #4\n"
            "beq 2f\n"
            "str %[color], [r4], #4\n"
            "2:\n"
            "tst %[width], #2\n"
            "beq 2f\n"
            "strh %[color], [r4], #2\n"
            "2:\n"
            "tst %[width], #1\n"
            "beq 2f\n"
            "strb %[color], [r4], #1\n"
            "2:\n"

            "subs %[height], %[height], #1\n"
            "add %[dst], %[dst], %[byte_stride]\n"
            "bne 1b\n"
            "b 3f\n"

            /* Special fillers for those widths that we can do with a single operation */
            "4:\n"
            "subs %[height], %[height], #1\n"
            "vst1.8 {d0}, [%[dst]]\n"
            "add %[dst], %[dst], %[byte_stride]\n"
            "bne 4b\n"
            "b 3f\n"

            "5:\n"
            "subs %[height], %[height], #1\n"
            "str %[color], [%[dst]]\n"
            "add %[dst], %[dst], %[byte_stride]\n"
            "bne 5b\n"
            "b 3f\n"

            "6:\n"
            "subs %[height], %[height], #1\n"
            "strh %[color], [%[dst]]\n"
            "add %[dst], %[dst], %[byte_stride]\n"
            "bne 6b\n"

            "3:\n"
            : /* No output members */
            : [color] "r" (color), [height] "r" (height), [width] "r" (width),
              [dst] "r" (dst), [byte_stride] "r" (byte_stride)
            : "memory", "cc", "d0", "r4", "r5");
    }
    else
    {
        asm volatile (
            "cmp %[height], #0\n" /* Check if empty fill */
            "beq 5f\n"
            "vdup.32 q0, %[color]\n" /* Fill the color to a NEON register */

            /* Loop starts here for each scanline */
            "1:\n"
            "mov r4, %[dst]\n" /* Starting address of the current line */
            "mov r5, %[width]\n" /* We're going to write this many bytes */
            "ands r6, r4, #15\n" /* Are we at a 128-bit aligned address? */
            "beq 2f\n" /* Jump to the best case */

            /* We're not 128-bit aligned: However, we know that we can get to the
               next aligned location, since the fill is at least 16 bytes wide */
            "rsb r6, r6, #16\n" /* We would need to go forward this much */
            "sub r5, r5, r6\n" /* Update bytes left */
            "tst r6, #1\n"
            "beq 6f\n"
            "vst1.8 {d0[0]}, [r4]!\n" /* Store byte, now we are word aligned */
            "6:\n"
            "tst r6, #2\n"
            "beq 6f\n"
            "vst1.16 {d0[0]}, [r4, :16]!\n" /* Store half word, now we are 16-bit aligned */
            "6:\n"
            "tst r6, #4\n"
            "beq 6f\n"
            "vst1.32 {d0[0]}, [r4, :32]!\n" /* Store word, now we're 32-bit aligned */
            "6:\n"
            "tst r6, #8\n"
            "beq 2f\n"
            "vst1.64 {d0}, [r4, :64]!\n" /* Store qword, now we're 64-bit aligned */

            /* The good case: We're 128-bit aligned for this scanline */
            "2:\n"
            "and r6, r5, #15\n" /* Number of tailing bytes */
            "cmp r5, r6\n" /* Do we have at least one qword to write? */
            "beq 6f\n" /* No, we just write the tail */
            "lsr r5, r5, #4\n" /* This many full qwords to write */

            /* The main block: Do 128-bit aligned writes */
            "3:\n"
            "subs r5, r5, #1\n"
            "vst1.64 {d0, d1}, [r4, :128]!\n"
            "bne 3b\n"

            /* Handle the tailing bytes: Do 64, 32, 16 and 8-bit aligned writes as needed.
               We know that we're currently at a 128-bit aligned address, so we can just
               pick the biggest operations that the remaining write width allows */
            "6:\n"
            "cmp r6, #0\n"
            "beq 4f\n"
            "tst r6, #8\n"
            "beq 7f\n"
            "vst1.64 {d0}, [r4, :64]!\n"
            "7:\n"
            "tst r6, #4\n"
            "beq 7f\n"
            "vst1.32 {d0[0]}, [r4, :32]!\n"
            "7:\n"
            "tst r6, #2\n"
            "beq 7f\n"
            "vst1.16 {d0[0]}, [r4, :16]!\n"
            "7:\n"
            "tst r6, #1\n"
            "beq 4f\n"
            "vst1.8 {d0[0]}, [r4]!\n"
            "4:\n"

            /* Handle the next scanline */
            "subs %[height], %[height], #1\n"
            "add %[dst], %[dst], %[byte_stride]\n"
            "bne 1b\n"
            "5:\n"
            : /* No output members */
            : [color] "r" (color), [height] "r" (height), [width] "r" (width),
              [dst] "r" (dst), [byte_stride] "r" (byte_stride)
            : "memory", "cc", "q0", "d0", "d1", "r4", "r5", "r6");
    }

    return TRUE;

#else

    /* TODO: intrinsic version for armcc */
    return FALSE;

#endif
}

/* TODO: is there a more generic way of doing this being introduced? */
#define NEON_SCANLINE_BUFFER_PIXELS (1024)
static inline void
neon_quadword_copy (void *   dst,
                    void *   src,
                    uint32_t count,         /* of quadwords */
                    uint32_t trailer_count  /* of bytes */)
{
    uint8_t *t_dst = dst, *t_src = src;

    /* Uses aligned multi-register loads to maximise read bandwidth
     * on uncached memory such as framebuffers.
     * The accesses do not have the aligned qualifiers, so that the copy
     * may convert between aligned-uncached and unaligned-cached memory.
     * It is assumed that the CPU can infer alignedness from the address.
     */

#ifdef USE_GCC_INLINE_ASM

    asm volatile (
        " cmp %[count], #8				\n"
        " blt 1f @ skip oversized fragments		\n"
        "0: @ start with eight quadwords at a time	\n"
        " sub %[count], %[count], #8			\n"
        " vld1.8 {d16, d17, d18, d19}, [%[src]]!	\n"
        " vld1.8 {d20, d21, d22, d23}, [%[src]]!	\n"
        " vld1.8 {d24, d25, d26, d27}, [%[src]]!	\n"
        " vld1.8 {d28, d29, d30, d31}, [%[src]]!	\n"
        " cmp %[count], #8				\n"
        " vst1.8 {d16, d17, d18, d19}, [%[dst]]!	\n"
        " vst1.8 {d20, d21, d22, d23}, [%[dst]]!	\n"
        " vst1.8 {d24, d25, d26, d27}, [%[dst]]!	\n"
        " vst1.8 {d28, d29, d30, d31}, [%[dst]]!	\n"
        " bge 0b					\n"
        "1: @ four quadwords				\n"
        " tst %[count], #4				\n"
        " beq 2f @ skip oversized fragment		\n"
        " vld1.8 {d16, d17, d18, d19}, [%[src]]!	\n"
        " vld1.8 {d20, d21, d22, d23}, [%[src]]!	\n"
        " vst1.8 {d16, d17, d18, d19}, [%[dst]]!	\n"
        " vst1.8 {d20, d21, d22, d23}, [%[dst]]!	\n"
        "2: @ two quadwords				\n"
        " tst %[count], #2				\n"
        " beq 3f @ skip oversized fragment		\n"
        " vld1.8 {d16, d17, d18, d19}, [%[src]]!	\n"
        " vst1.8 {d16, d17, d18, d19}, [%[dst]]!	\n"
        "3: @ one quadword				\n"
        " tst %[count], #1				\n"
        " beq 4f @ skip oversized fragment		\n"
        " vld1.8 {d16, d17}, [%[src]]!			\n"
        " vst1.8 {d16, d17}, [%[dst]]!			\n"
        "4: @ end					\n"

        /* Clobbered input registers marked as input/outputs */
        : [dst] "+r" (t_dst), [src] "+r" (t_src), [count] "+r" (count)

        /* No unclobbered inputs */
        :

        /* Clobbered vector registers */
        /* NB: these are the quad aliases of the double
         * registers used in the asm
         */
        : "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory");
#else

    while (count >= 8)
    {
        uint8x16x4_t t1 = vld4q_u8 (t_src);
        uint8x16x4_t t2 = vld4q_u8 (t_src + sizeof(uint8x16x4_t));

        t_src += sizeof(uint8x16x4_t) * 2;
        vst4q_u8 (t_dst, t1);
        vst4q_u8 (t_dst + sizeof(uint8x16x4_t), t2);
        t_dst += sizeof(uint8x16x4_t) * 2;
        count -= 8;
    }

    if (count & 4)
    {
        uint8x16x4_t t1 = vld4q_u8 (t_src);

        t_src += sizeof(uint8x16x4_t);
        vst4q_u8 (t_dst, t1);
        t_dst += sizeof(uint8x16x4_t);
    }

    if (count & 2)
    {
        uint8x8x4_t t1 = vld4_u8 (t_src);

        t_src += sizeof(uint8x8x4_t);
        vst4_u8 (t_dst, t1);
        t_dst += sizeof(uint8x8x4_t);
    }

    if (count & 1)
    {
        uint8x16_t t1 = vld1q_u8 (t_src);

        t_src += sizeof(uint8x16_t);
        vst1q_u8 (t_dst, t1);
        t_dst += sizeof(uint8x16_t);
    }

#endif /* !USE_GCC_INLINE_ASM */
    if (trailer_count)
    {
        if (trailer_count & 8)
        {
            uint8x8_t t1 = vld1_u8 (t_src);

            t_src += sizeof(uint8x8_t);
            vst1_u8 (t_dst, t1);
            t_dst += sizeof(uint8x8_t);
        }

        if (trailer_count & 4)
        {
            *((uint32_t*) t_dst) = *((uint32_t*) t_src);

            t_dst += 4;
            t_src += 4;
        }

        if (trailer_count & 2)
        {
            *((uint16_t*) t_dst) = *((uint16_t*) t_src);

            t_dst += 2;
            t_src += 2;
        }

        if (trailer_count & 1)
        {
            *t_dst++ = *t_src++;
        }
    }
}
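/* Example: a 13-byte trailer is written as one 8-byte vst1, one 32-bit
 * word and one final byte, so at most three tail stores happen per call.
 */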
static inline void
solid_over_565_8_pix_neon (uint32_t  glyph_colour,
                           uint16_t *dest,
                           uint8_t * in_mask,
                           uint32_t  dest_stride,  /* bytes, not elements */
                           uint32_t  mask_stride,
                           uint32_t  count         /* 8-pixel groups */)
{
    /* Inner loop of glyph blitter (solid colour, alpha mask) */

#ifdef USE_GCC_INLINE_ASM

    asm volatile (
        " vld4.8 {d20[], d21[], d22[], d23[]}, [%[glyph_colour]] @ splat solid colour components	\n"
        "0: @ loop										\n"
        " vld1.16 {d0, d1}, [%[dest]]	@ load first pixels from framebuffer			\n"
        " vld1.8 {d17}, [%[in_mask]]	@ load alpha mask of glyph				\n"
        " vmull.u8 q9, d17, d23		@ apply glyph colour alpha to mask			\n"
        " vshrn.u16 d17, q9, #8		@ reformat it to match original mask			\n"
        " vmvn d18, d17			@ we need the inverse mask for the background		\n"
        " vsli.u16 q3, q0, #5		@ duplicate framebuffer blue bits			\n"
        " vshrn.u16 d2, q0, #8		@ unpack red from framebuffer pixels			\n"
        " vshrn.u16 d4, q0, #3		@ unpack green						\n"
        " vsri.u8 d2, d2, #5		@ duplicate red bits (extend 5 to 8)			\n"
        " vshrn.u16 d6, q3, #2		@ unpack extended blue (truncate 10 to 8)		\n"
        " vsri.u8 d4, d4, #6		@ duplicate green bits (extend 6 to 8)			\n"
        " vmull.u8 q1, d2, d18		@ apply inverse mask to background red...		\n"
        " vmull.u8 q2, d4, d18		@ ...green...						\n"
        " vmull.u8 q3, d6, d18		@ ...blue						\n"
        " subs %[count], %[count], #1	@ decrement/test loop counter				\n"
        " vmlal.u8 q1, d17, d22		@ add masked foreground red...				\n"
        " vmlal.u8 q2, d17, d21		@ ...green...						\n"
        " vmlal.u8 q3, d17, d20		@ ...blue						\n"
        " add %[in_mask], %[in_mask], %[mask_stride] @ advance mask pointer, while we wait	\n"
        " vsri.16 q1, q2, #5		@ pack green behind red					\n"
        " vsri.16 q1, q3, #11		@ pack blue into pixels					\n"
        " vst1.16 {d2, d3}, [%[dest]]	@ store composited pixels				\n"
        " add %[dest], %[dest], %[dest_stride]	@ advance framebuffer pointer			\n"
        " bne 0b			@ next please						\n"

        /* Clobbered registers marked as input/outputs */
        : [dest] "+r" (dest), [in_mask] "+r" (in_mask), [count] "+r" (count)

        /* Inputs */
        : [dest_stride] "r" (dest_stride), [mask_stride] "r" (mask_stride), [glyph_colour] "r" (&glyph_colour)

        /* Clobbers, including the inputs we modify, and potentially lots of memory */
        : "q0", "q1", "q2", "q3", "d17", "q9", "q10", "q11", "q12", "cc", "memory"
        );

#else
    uint8x8x4_t solid_colour = vld4_dup_u8 ((uint8_t*) &glyph_colour);

    while (count--)
    {
        uint16x8_t pixels = vld1q_u16 (dest);
        uint8x8_t mask = vshrn_n_u16 (vmull_u8 (solid_colour.val[3], vld1_u8 (in_mask)), 8);
        uint8x8_t mask_image = vmvn_u8 (mask);

        uint8x8_t t_red   = vshrn_n_u16 (pixels, 8);
        uint8x8_t t_green = vshrn_n_u16 (pixels, 3);
        uint8x8_t t_blue  = vshrn_n_u16 (vsliq_n_u16 (pixels, pixels, 5), 2);

        uint16x8_t s_red   = vmull_u8 (vsri_n_u8 (t_red, t_red, 5), mask_image);
        uint16x8_t s_green = vmull_u8 (vsri_n_u8 (t_green, t_green, 6), mask_image);
        uint16x8_t s_blue  = vmull_u8 (t_blue, mask_image);

        s_red   = vmlal_u8 (s_red, mask, solid_colour.val[2]);
        s_green = vmlal_u8 (s_green, mask, solid_colour.val[1]);
        s_blue  = vmlal_u8 (s_blue, mask, solid_colour.val[0]);

        pixels = vsri_n_u16 (s_red, s_green, 5);
        pixels = vsri_n_u16 (pixels, s_blue, 11);
        vst1q_u16 (dest, pixels);

        dest = (uint16_t*) ((uint8_t*) dest + dest_stride);  /* stride is in bytes */
        in_mask += mask_stride;
    }

#endif
}
static void
neon_composite_over_n_8_0565 (pixman_implementation_t * impl,
                              pixman_op_t               op,
                              pixman_image_t *          src_image,
                              pixman_image_t *          mask_image,
                              pixman_image_t *          dst_image,
                              int32_t                   src_x,
                              int32_t                   src_y,
                              int32_t                   mask_x,
                              int32_t                   mask_y,
                              int32_t                   dest_x,
                              int32_t                   dest_y,
                              int32_t                   width,
                              int32_t                   height)
{
    uint32_t src;
    uint16_t *dst_line, *aligned_line;
    uint8_t *mask_line;
    uint32_t dst_stride, mask_stride;
    uint32_t kernel_count, copy_count, copy_tail;
    uint8_t kernel_offset, copy_offset;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    /* bail out if fully transparent or degenerate */
    if (src == 0)
        return;

    if (width == 0 || height == 0)
        return;

    if (width > NEON_SCANLINE_BUFFER_PIXELS)
    {
        /* split the blit, so we can use a fixed-size scanline buffer
         * TODO: there must be a more elegant way of doing this.
         */
        int x;
        for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS)
        {
            neon_composite_over_n_8_0565 (
                impl, op,
                src_image, mask_image, dst_image,
                src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y,
                (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height);
        }

        return;
    }

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    /* keep within the minimum number of aligned quadwords on width
     * while also keeping the minimum number of columns to process
     */
    {
        unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
        unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
        unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;

        /* the fast copy should be quadword aligned */
        copy_offset = dst_line - ((uint16_t*) aligned_left);
        aligned_line = dst_line - copy_offset;
        copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
        copy_tail = 0;

        if (aligned_right - aligned_left > ceiling_length)
        {
            /* unaligned routine is tightest */
            kernel_count = (uint32_t) (ceiling_length >> 4);
            kernel_offset = copy_offset;
        }
        else
        {
            /* aligned routine is equally tight, so it is safer to align */
            kernel_count = copy_count;
            kernel_offset = 0;
        }

        /* We should avoid reading beyond scanline ends for safety */
        if (aligned_line < (dst_line - dest_x) ||
            (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
        {
            /* switch to precise read */
            copy_offset = kernel_offset = 0;
            aligned_line = dst_line;
            kernel_count = (uint32_t) (ceiling_length >> 4);
            copy_count = (width * sizeof(*dst_line)) >> 4;
            copy_tail = (width * sizeof(*dst_line)) & 0xF;
        }
    }
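    /* Worked example: if dst_line sits at an address ending in ...02 and
     * width is 100 pixels (200 bytes), aligned_left rounds down 2 bytes
     * (copy_offset = 1 pixel) and aligned_right rounds up to the next
     * quadword, so copy_count is 13 quadwords; ceiling_length is also 13
     * quadwords, so the aligned variant is chosen as it is no wider.
     */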
    {
        uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8];  /* deliberately not initialised */
        uint8_t glyph_line[NEON_SCANLINE_BUFFER_PIXELS + 8];
        int y = height;

        /* row-major order */
        /* left edge, middle block, right edge */
        for ( ; y--; mask_line += mask_stride, aligned_line += dst_stride, dst_line += dst_stride)
        {
            /* We don't want to overrun the edges of the glyph,
             * so realign the edge data into known buffers
             */
            neon_quadword_copy (glyph_line + copy_offset, mask_line, width >> 4, width & 0xF);

            /* Uncached framebuffer access is really, really slow
             * if we do it piecemeal. It should be much faster if we
             * grab it all at once. One scanline should easily fit in
             * L1 cache, so this should not waste RAM bandwidth.
             */
            neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail);

            /* Apply the actual filter */
            solid_over_565_8_pix_neon (
                src, scan_line + kernel_offset,
                glyph_line + kernel_offset, 8 * sizeof(*dst_line),
                8 * sizeof(*glyph_line), kernel_count);

            /* Copy the modified scanline back */
            neon_quadword_copy (dst_line, scan_line + copy_offset,
                                width >> 3, (width & 7) * 2);
        }
    }
}
#ifdef USE_GCC_INLINE_ASM

static inline void
plain_over_565_8_pix_neon (uint32_t  colour,
                           uint16_t *dest,
                           uint32_t  dest_stride,  /* bytes, not elements */
                           uint32_t  count         /* 8-pixel groups */)
{
    /* Inner loop for plain translucent rects
     * (solid colour without alpha mask)
     */
    asm volatile (
        " vld4.8 {d20[], d21[], d22[], d23[]}, [%[colour]] @ solid colour load/splat	\n"
        " vmull.u8 q12, d23, d22	@ premultiply alpha red				\n"
        " vmull.u8 q13, d23, d21	@ premultiply alpha green			\n"
        " vmull.u8 q14, d23, d20	@ premultiply alpha blue			\n"
        " vmvn d18, d23			@ inverse alpha for background			\n"
        "0: @ loop									\n"
        " vld1.16 {d0, d1}, [%[dest]]	@ load first pixels from framebuffer		\n"
        " vshrn.u16 d2, q0, #8		@ unpack red from framebuffer pixels		\n"
        " vshrn.u16 d4, q0, #3		@ unpack green					\n"
        " vsli.u16 q3, q0, #5		@ duplicate framebuffer blue bits		\n"
        " vsri.u8 d2, d2, #5		@ duplicate red bits (extend 5 to 8)		\n"
        " vsri.u8 d4, d4, #6		@ duplicate green bits (extend 6 to 8)		\n"
        " vshrn.u16 d6, q3, #2		@ unpack extended blue (truncate 10 to 8)	\n"
        " vmov q0, q12			@ retrieve foreground red			\n"
        " vmlal.u8 q0, d2, d18		@ blend red - my kingdom for a four-operand MLA	\n"
        " vmov q1, q13			@ retrieve foreground green			\n"
        " vmlal.u8 q1, d4, d18		@ blend green					\n"
        " vmov q2, q14			@ retrieve foreground blue			\n"
        " vmlal.u8 q2, d6, d18		@ blend blue					\n"
        " subs %[count], %[count], #1	@ decrement/test loop counter			\n"
        " vsri.16 q0, q1, #5		@ pack green behind red				\n"
        " vsri.16 q0, q2, #11		@ pack blue into pixels				\n"
        " vst1.16 {d0, d1}, [%[dest]]	@ store composited pixels			\n"
        " add %[dest], %[dest], %[dest_stride]	@ advance framebuffer pointer		\n"
        " bne 0b			@ next please					\n"

        /* Clobbered registers marked as input/outputs */
        : [dest] "+r" (dest), [count] "+r" (count)

        /* Inputs */
        : [dest_stride] "r" (dest_stride), [colour] "r" (&colour)

        /* Clobbers, including the inputs we modify, and
         * potentially lots of memory
         */
        : "q0", "q1", "q2", "q3", "q9",
          "q10", "q11", "q12", "q13", "q14",
          "cc", "memory"
        );
}
static void
neon_composite_over_n_0565 (pixman_implementation_t * impl,
                            pixman_op_t               op,
                            pixman_image_t *          src_image,
                            pixman_image_t *          mask_image,
                            pixman_image_t *          dst_image,
                            int32_t                   src_x,
                            int32_t                   src_y,
                            int32_t                   mask_x,
                            int32_t                   mask_y,
                            int32_t                   dest_x,
                            int32_t                   dest_y,
                            int32_t                   width,
                            int32_t                   height)
{
    uint32_t src;
    uint16_t *dst_line, *aligned_line;
    uint32_t dst_stride;
    uint32_t kernel_count, copy_count, copy_tail;
    uint8_t kernel_offset, copy_offset;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    /* bail out if fully transparent */
    if (src == 0)
        return;

    if (width == 0 || height == 0)
        return;

    if (width > NEON_SCANLINE_BUFFER_PIXELS)
    {
        /* split the blit, so we can use a fixed-size scanline buffer
         * TODO: there must be a more elegant way of doing this.
         */
        int x;
        for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS)
        {
            neon_composite_over_n_0565 (
                impl, op,
                src_image, mask_image, dst_image,
                src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y,
                (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height);
        }

        return;
    }
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    /* keep within the minimum number of aligned quadwords on width
     * while also keeping the minimum number of columns to process
     */
    {
        unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
        unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
        unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;

        /* the fast copy should be quadword aligned */
        copy_offset = dst_line - ((uint16_t*) aligned_left);
        aligned_line = dst_line - copy_offset;
        copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
        copy_tail = 0;

        if (aligned_right - aligned_left > ceiling_length)
        {
            /* unaligned routine is tightest */
            kernel_count = (uint32_t) (ceiling_length >> 4);
            kernel_offset = copy_offset;
        }
        else
        {
            /* aligned routine is equally tight, so it is safer to align */
            kernel_count = copy_count;
            kernel_offset = 0;
        }

        /* We should avoid reading beyond scanline ends for safety */
        if (aligned_line < (dst_line - dest_x) ||
            (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
        {
            /* switch to precise read */
            copy_offset = kernel_offset = 0;
            aligned_line = dst_line;
            kernel_count = (uint32_t) (ceiling_length >> 4);
            copy_count = (width * sizeof(*dst_line)) >> 4;
            copy_tail = (width * sizeof(*dst_line)) & 0xF;
        }
    }

    {
        uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8];  /* deliberately not initialised */

        /* row-major order */
        /* left edge, middle block, right edge */
        for ( ; height--; aligned_line += dst_stride, dst_line += dst_stride)
        {
            /* Uncached framebuffer access is really, really slow if we do it piecemeal.
             * It should be much faster if we grab it all at once.
             * One scanline should easily fit in L1 cache, so this should
             * not waste RAM bandwidth.
             */
            neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail);

            /* Apply the actual filter */
            plain_over_565_8_pix_neon (
                src, scan_line + kernel_offset, 8 * sizeof(*dst_line), kernel_count);

            /* Copy the modified scanline back */
            neon_quadword_copy (
                dst_line, scan_line + copy_offset, width >> 3, (width & 7) * 2);
        }
    }
}
static inline void
ARGB8_over_565_8_pix_neon (uint32_t *src,
                           uint16_t *dest,
                           uint32_t  src_stride,  /* bytes, not elements */
                           uint32_t  count        /* 8-pixel groups */)
{
    asm volatile (
        "0: @ loop									\n"
        " pld [%[src], %[src_stride]]	@ preload from next scanline			\n"
        " vld1.16 {d0, d1}, [%[dest]]	@ load pixels from framebuffer			\n"
        " vld4.8 {d20, d21, d22, d23}, [%[src]]! @ load source image pixels		\n"
        " vsli.u16 q3, q0, #5		@ duplicate framebuffer blue bits		\n"
        " vshrn.u16 d2, q0, #8		@ unpack red from framebuffer pixels		\n"
        " vshrn.u16 d4, q0, #3		@ unpack green					\n"
        " vmvn d18, d23			@ we need the inverse alpha for the background	\n"
        " vsri.u8 d2, d2, #5		@ duplicate red bits (extend 5 to 8)		\n"
        " vshrn.u16 d6, q3, #2		@ unpack extended blue (truncate 10 to 8)	\n"
        " vsri.u8 d4, d4, #6		@ duplicate green bits (extend 6 to 8)		\n"
        " vmull.u8 q1, d2, d18		@ apply inverse alpha to background red...	\n"
        " vmull.u8 q2, d4, d18		@ ...green...					\n"
        " vmull.u8 q3, d6, d18		@ ...blue					\n"
        " subs %[count], %[count], #1	@ decrement/test loop counter			\n"
        " vmlal.u8 q1, d23, d22		@ add blended foreground red...			\n"
        " vmlal.u8 q2, d23, d21		@ ...green...					\n"
        " vmlal.u8 q3, d23, d20		@ ...blue					\n"
        " vsri.16 q1, q2, #5		@ pack green behind red				\n"
        " vsri.16 q1, q3, #11		@ pack blue into pixels				\n"
        " vst1.16 {d2, d3}, [%[dest]]!	@ store composited pixels			\n"
        " bne 0b			@ next please					\n"

        /* Clobbered registers marked as input/outputs */
        : [dest] "+r" (dest), [src] "+r" (src), [count] "+r" (count)

        /* Inputs */
        : [src_stride] "r" (src_stride)

        /* Clobbers, including the inputs we modify, and potentially lots of memory */
        : "q0", "q1", "q2", "q3", "d17", "d18", "q10", "q11", "cc", "memory"
        );
}
static void
neon_composite_over_8888_0565 (pixman_implementation_t * impl,
                               pixman_op_t               op,
                               pixman_image_t *          src_image,
                               pixman_image_t *          mask_image,
                               pixman_image_t *          dst_image,
                               int32_t                   src_x,
                               int32_t                   src_y,
                               int32_t                   mask_x,
                               int32_t                   mask_y,
                               int32_t                   dest_x,
                               int32_t                   dest_y,
                               int32_t                   width,
                               int32_t                   height)
{
    uint32_t *src_line;
    uint16_t *dst_line, *aligned_line;
    uint32_t dst_stride, src_stride;
    uint32_t kernel_count, copy_count, copy_tail;
    uint8_t kernel_offset, copy_offset;

    /* we assume mask is opaque,
     * so the only alpha to deal with is embedded in src
     */
    if (width > NEON_SCANLINE_BUFFER_PIXELS)
    {
        /* split the blit, so we can use a fixed-size scanline buffer */
        int x;
        for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS)
        {
            neon_composite_over_8888_0565 (
                impl, op,
                src_image, mask_image, dst_image,
                src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y,
                (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height);
        }

        return;
    }
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    /* keep within the minimum number of aligned quadwords on width
     * while also keeping the minimum number of columns to process
     */
    {
        unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
        unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
        unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;

        /* the fast copy should be quadword aligned */
        copy_offset = dst_line - ((uint16_t*) aligned_left);
        aligned_line = dst_line - copy_offset;
        copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
        copy_tail = 0;

        if (aligned_right - aligned_left > ceiling_length)
        {
            /* unaligned routine is tightest */
            kernel_count = (uint32_t) (ceiling_length >> 4);
            kernel_offset = copy_offset;
        }
        else
        {
            /* aligned routine is equally tight, so it is safer to align */
            kernel_count = copy_count;
            kernel_offset = 0;
        }

        /* We should avoid reading beyond scanline ends for safety */
        if (aligned_line < (dst_line - dest_x) ||
            (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
        {
            /* switch to precise read */
            copy_offset = kernel_offset = 0;
            aligned_line = dst_line;
            kernel_count = (uint32_t) (ceiling_length >> 4);
            copy_count = (width * sizeof(*dst_line)) >> 4;
            copy_tail = (width * sizeof(*dst_line)) & 0xF;
        }
    }

    /* Preload the first input scanline */
    {
        uint8_t *src_ptr = (uint8_t*) src_line;
        uint32_t count = (width + 15) / 16;

#ifdef USE_GCC_INLINE_ASM
        asm volatile (
            "0: @ loop						\n"
            " subs %[count], %[count], #1			\n"
            " pld [%[src]]					\n"
            " add %[src], %[src], #64				\n"
            " bgt 0b						\n"

            /* Clobbered input registers marked as input/outputs */
            : [src] "+r" (src_ptr), [count] "+r" (count)
            : /* no unclobbered inputs */
            : "cc"
            );
#else
        do
        {
            __pld (src_ptr);
            src_ptr += 64;
        }
        while (--count);
#endif
    }

    {
        uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8];  /* deliberately not initialised */

        /* row-major order */
        /* left edge, middle block, right edge */
        for ( ; height--; src_line += src_stride, aligned_line += dst_stride, dst_line += dst_stride)
        {
            /* Uncached framebuffer access is really, really slow if we do
             * it piecemeal. It should be much faster if we grab it all at
             * once. One scanline should easily fit in L1 cache, so this
             * should not waste RAM bandwidth.
             */
            neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail);

            /* Apply the actual filter */
            ARGB8_over_565_8_pix_neon (
                src_line, scan_line + kernel_offset,
                src_stride * sizeof(*src_line), kernel_count);

            /* Copy the modified scanline back */
            neon_quadword_copy (dst_line,
                                scan_line + copy_offset,
                                width >> 3, (width & 7) * 2);
        }
    }
}

#endif /* USE_GCC_INLINE_ASM */
static const pixman_fast_path_t arm_neon_fast_path_array[] =
{
    { PIXMAN_OP_ADD,  PIXMAN_solid,    PIXMAN_a8,   PIXMAN_a8,       neon_composite_add_8888_8_8,     0 },
    { PIXMAN_OP_ADD,  PIXMAN_a8,       PIXMAN_null, PIXMAN_a8,       neon_composite_add_8000_8000,    0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,   PIXMAN_r5g6b5,   neon_composite_over_n_8_0565,    0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,   PIXMAN_b5g6r5,   neon_composite_over_n_8_0565,    0 },
    { PIXMAN_OP_SRC,  PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5,   neon_composite_src_24_16,        0 },
    { PIXMAN_OP_SRC,  PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5,   neon_composite_src_24_16,        0 },
    { PIXMAN_OP_SRC,  PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5,   neon_composite_src_24_16,        0 },
    { PIXMAN_OP_SRC,  PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5,   neon_composite_src_24_16,        0 },
#ifdef USE_GCC_INLINE_ASM
    { PIXMAN_OP_SRC,  PIXMAN_r5g6b5,   PIXMAN_null, PIXMAN_r5g6b5,   neon_composite_src_16_16,        0 },
    { PIXMAN_OP_SRC,  PIXMAN_b5g6r5,   PIXMAN_null, PIXMAN_b5g6r5,   neon_composite_src_16_16,        0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null, PIXMAN_r5g6b5,   neon_composite_over_n_0565,      0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null, PIXMAN_b5g6r5,   neon_composite_over_n_0565,      0 },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5,   neon_composite_over_8888_0565,   0 },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5,   neon_composite_over_8888_0565,   0 },
#endif
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, neon_composite_over_8888_8888,   0 },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, neon_composite_over_8888_8888,   0 },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, neon_composite_over_8888_8888,   0 },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, neon_composite_over_8888_8888,   0 },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,   PIXMAN_a8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,   PIXMAN_x8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,   PIXMAN_a8r8g8b8, neon_composite_over_n_8_8888,    0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,   PIXMAN_x8r8g8b8, neon_composite_over_n_8_8888,    0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,   PIXMAN_a8b8g8r8, neon_composite_over_n_8_8888,    0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,   PIXMAN_x8b8g8r8, neon_composite_over_n_8_8888,    0 },

    { PIXMAN_OP_NONE },
};

const pixman_fast_path_t *const arm_neon_fast_paths = arm_neon_fast_path_array;
static void
arm_neon_composite (pixman_implementation_t *imp,
                    pixman_op_t               op,
                    pixman_image_t *          src,
                    pixman_image_t *          mask,
                    pixman_image_t *          dest,
                    int32_t                   src_x,
                    int32_t                   src_y,
                    int32_t                   mask_x,
                    int32_t                   mask_y,
                    int32_t                   dest_x,
                    int32_t                   dest_y,
                    int32_t                   width,
                    int32_t                   height)
{
    if (_pixman_run_fast_path (arm_neon_fast_paths, imp,
                               op, src, mask, dest,
                               src_x, src_y,
                               mask_x, mask_y,
                               dest_x, dest_y,
                               width, height))
    {
        return;
    }

    _pixman_implementation_composite (imp->delegate, op,
                                      src, mask, dest,
                                      src_x, src_y,
                                      mask_x, mask_y,
                                      dest_x, dest_y,
                                      width, height);
}
static pixman_bool_t
pixman_blt_neon (void *src_bits,
                 void *dst_bits,
                 int   src_stride,
                 int   dst_stride,
                 int   src_bpp,
                 int   dst_bpp,
                 int   src_x,
                 int   src_y,
                 int   dst_x,
                 int   dst_y,
                 int   width,
                 int   height)
{
    if (!width || !height)
        return TRUE;

    /* accelerate only straight copies involving complete bytes */
    if (src_bpp != dst_bpp || (src_bpp & 7))
        return FALSE;

    {
        uint32_t bytes_per_pixel = src_bpp >> 3;
        uint32_t byte_width = width * bytes_per_pixel;
        /* parameter is in words for some reason */
        int32_t src_stride_bytes = src_stride * 4;
        int32_t dst_stride_bytes = dst_stride * 4;
        uint8_t *src_bytes = ((uint8_t*) src_bits) +
            src_y * src_stride_bytes + src_x * bytes_per_pixel;
        uint8_t *dst_bytes = ((uint8_t*) dst_bits) +
            dst_y * dst_stride_bytes + dst_x * bytes_per_pixel;
        uint32_t quadword_count = byte_width / 16;
        uint32_t offset = byte_width % 16;
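        /* For instance, a 50-pixel-wide r5g6b5 blit has byte_width 100:
         * each scanline is copied as 6 whole quadwords plus a 4-byte
         * trailer.
         */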
        while (height--)
        {
            neon_quadword_copy (dst_bytes, src_bytes, quadword_count, offset);
            src_bytes += src_stride_bytes;
            dst_bytes += dst_stride_bytes;
        }
    }

    return TRUE;
}
static pixman_bool_t
arm_neon_blt (pixman_implementation_t *imp,
              uint32_t *               src_bits,
              uint32_t *               dst_bits,
              int                      src_stride,
              int                      dst_stride,
              int                      src_bpp,
              int                      dst_bpp,
              int                      src_x,
              int                      src_y,
              int                      dst_x,
              int                      dst_y,
              int                      width,
              int                      height)
{
    if (pixman_blt_neon (
            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
            src_x, src_y, dst_x, dst_y, width, height))
    {
        return TRUE;
    }

    return _pixman_implementation_blt (
        imp->delegate,
        src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
        src_x, src_y, dst_x, dst_y, width, height);
}

static pixman_bool_t
arm_neon_fill (pixman_implementation_t *imp,
               uint32_t *               bits,
               int                      stride,
               int                      bpp,
               int                      x,
               int                      y,
               int                      width,
               int                      height,
               uint32_t                 xor)
{
    if (pixman_fill_neon (bits, stride, bpp, x, y, width, height, xor))
        return TRUE;

    return _pixman_implementation_fill (
        imp->delegate, bits, stride, bpp, x, y, width, height, xor);
}
pixman_implementation_t *
_pixman_implementation_create_arm_neon (void)
{
    pixman_implementation_t *simd = _pixman_implementation_create_arm_simd ();
    pixman_implementation_t *imp = _pixman_implementation_create (simd);

    imp->composite = arm_neon_composite;
    imp->blt = arm_neon_blt;
    imp->fill = arm_neon_fill;

    return imp;
}