2 * Copyright © 2009 ARM Ltd, Movial Creative Technologies Oy
4 * Permission to use, copy, modify, distribute, and sell this software and its
5 * documentation for any purpose is hereby granted without fee, provided that
6 * the above copyright notice appear in all copies and that both that
7 * copyright notice and this permission notice appear in supporting
8 * documentation, and that the name of ARM Ltd not be used in
9 * advertising or publicity pertaining to distribution of the software without
10 * specific, written prior permission. ARM Ltd makes no
11 * representations about the suitability of this software for any purpose. It
12 * is provided "as is" without express or implied warranty.
14 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
15 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
16 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
17 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
18 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
19 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
23 * Author: Ian Rickards (ian.rickards@arm.com)
24 * Author: Jonathan Morton (jonathan.morton@movial.com)
25 * Author: Markku Vire (markku.vire@movial.com)
#include <arm_neon.h>	/* NEON intrinsics used throughout this file */
#include "pixman-private.h"
37 /* Deal with an intrinsic that is defined differently in GCC */
#if !defined(__ARMCC_VERSION) && !defined(__pld)
#define __pld(_x) __builtin_prefetch (_x)
#endif
static force_inline uint8x8x4_t
unpack0565 (uint16x8_t rgb)
{
    uint16x8_t gb, b;
    uint8x8x4_t res;

    res.val[3] = vdup_n_u8 (0);
    gb = vshrq_n_u16 (rgb, 5);
    b = vshrq_n_u16 (rgb, 5 + 6);

    res.val[0] = vmovn_u16 (rgb);  /* get low 5 bits */
    res.val[1] = vmovn_u16 (gb);   /* get mid 6 bits */
    res.val[2] = vmovn_u16 (b);    /* get top 5 bits */

    res.val[0] = vshl_n_u8 (res.val[0], 3); /* shift to top */
    res.val[1] = vshl_n_u8 (res.val[1], 2); /* shift to top */
    res.val[2] = vshl_n_u8 (res.val[2], 3); /* shift to top */

    /* duplicate the high bits into the freshly cleared low bits */
    res.val[0] = vsri_n_u8 (res.val[0], res.val[0], 5);
    res.val[1] = vsri_n_u8 (res.val[1], res.val[1], 6);
    res.val[2] = vsri_n_u8 (res.val[2], res.val[2], 5);

    return res;
}
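/* Note on the unpacking above: a 5- or 6-bit field is widened to 8 bits by
 * shifting it to the top of the byte and then replicating its high bits into
 * the low bits (the vsri step), i.e. for a 5-bit value v: v8 = (v << 3) | (v >> 2),
 * so 0 stays 0 and 31 becomes 255.
 */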
static force_inline uint16x8_t
pack0565 (uint8x8x4_t s)
{
    uint16x8_t rgb, val_g, val_r;

    rgb = vshll_n_u8 (s.val[2], 8);
    val_g = vshll_n_u8 (s.val[1], 8);
    val_r = vshll_n_u8 (s.val[0], 8);

    rgb = vsriq_n_u16 (rgb, val_g, 5);
    rgb = vsriq_n_u16 (rgb, val_r, 5 + 6);

    return rgb;
}
static force_inline uint8x8_t
neon2mul (uint8x8_t x, uint8x8_t alpha)
{
    uint16x8_t tmp, tmp2;
    uint8x8_t res;
    tmp = vmull_u8 (x, alpha);
    tmp2 = vrshrq_n_u16 (tmp, 8);
    res = vraddhn_u16 (tmp, tmp2);
    return res;
}
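/* For reference, neon2mul computes a rounded (x * alpha) / 255 per byte:
 * t = x * alpha, then vraddhn_u16 (t, vrshrq_n_u16 (t, 8)) is the high byte
 * of (t + 128) + ((t + 128) >> 8). A minimal scalar sketch of the same
 * approximation (the function name is illustrative only, not part of pixman):
 */
static force_inline uint8_t
scalar_mul_div_255 (uint8_t x, uint8_t alpha)
{
    uint16_t t = (uint16_t)x * alpha;

    /* effectively (x * alpha) / 255, rounded to nearest */
    return (uint8_t)((t + 128 + ((t + 128) >> 8)) >> 8);
}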
static force_inline uint8x8x4_t
neon8mul (uint8x8x4_t x,
          uint8x8_t   alpha)
{
    uint16x8x4_t tmp;
    uint8x8x4_t res;
    uint16x8_t qtmp1, qtmp2;

    tmp.val[0] = vmull_u8 (x.val[0], alpha);
    tmp.val[1] = vmull_u8 (x.val[1], alpha);
    tmp.val[2] = vmull_u8 (x.val[2], alpha);
    tmp.val[3] = vmull_u8 (x.val[3], alpha);

    qtmp1 = vrshrq_n_u16 (tmp.val[0], 8);
    qtmp2 = vrshrq_n_u16 (tmp.val[1], 8);
    res.val[0] = vraddhn_u16 (tmp.val[0], qtmp1);
    qtmp1 = vrshrq_n_u16 (tmp.val[2], 8);
    res.val[1] = vraddhn_u16 (tmp.val[1], qtmp2);
    qtmp2 = vrshrq_n_u16 (tmp.val[3], 8);
    res.val[2] = vraddhn_u16 (tmp.val[2], qtmp1);
    res.val[3] = vraddhn_u16 (tmp.val[3], qtmp2);

    return res;
}
static force_inline uint8x8x4_t
neon8qadd (uint8x8x4_t x, uint8x8x4_t y)
{
    uint8x8x4_t res;
    res.val[0] = vqadd_u8 (x.val[0], y.val[0]);
    res.val[1] = vqadd_u8 (x.val[1], y.val[1]);
    res.val[2] = vqadd_u8 (x.val[2], y.val[2]);
    res.val[3] = vqadd_u8 (x.val[3], y.val[3]);
    return res;
}
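/* Together these helpers implement the premultiplied OVER operator used by
 * the compositing routines below: per channel, dest = src + dest * (255 -
 * src_alpha) / 255, with the final addition saturating (vqadd). A scalar
 * sketch for one a8r8g8b8 pixel, for reference only (the function name is
 * illustrative, not part of pixman):
 */
static force_inline uint32_t
scalar_over_8888 (uint32_t src, uint32_t dst)
{
    uint32_t sa = src >> 24;
    uint32_t res = 0;
    int c;

    for (c = 0; c < 32; c += 8)
    {
	uint32_t s = (src >> c) & 0xff;
	uint32_t d = (dst >> c) & 0xff;
	uint32_t t = d * (255 - sa);

	d = (t + 128 + ((t + 128) >> 8)) >> 8;	/* d * (255 - sa) / 255 */
	d += s;
	if (d > 255)
	    d = 255;				/* saturate, like vqadd.u8 */
	res |= d << c;
    }

    return res;
}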
135 neon_composite_add_8000_8000 (pixman_implementation_t * impl,
137 pixman_image_t * src_image,
138 pixman_image_t * mask_image,
139 pixman_image_t * dst_image,
149 uint8_t *dst_line, *dst;
150 uint8_t *src_line, *src;
151 int dst_stride, src_stride;
154 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
155 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
159 /* Use overlapping 8-pixel method */
162 uint8_t *keep_dst = 0;
163 uint8x8_t sval, dval, temp;
166 dst_line += dst_stride;
168 src_line += src_stride;
171 #ifndef USE_GCC_INLINE_ASM
172 sval = vld1_u8 ((void*)src);
173 dval = vld1_u8 ((void*)dst);
176 temp = vqadd_u8 (dval, sval);
184 sval = vld1_u8 ((void*)src);
185 dval = vld1_u8 ((void*)dst);
187 vst1_u8 ((void*)keep_dst, temp);
190 temp = vqadd_u8 (dval, sval);
197 vst1_u8 ((void*)keep_dst, temp);
200 /* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
201 "vld1.8 {d0}, [%[src]]\n\t"
202 "vld1.8 {d4}, [%[dst]]\n\t"
203 "mov %[keep_dst], %[dst]\n\t"
205 "and ip, %[w], #7\n\t"
206 "add %[src], %[src], ip\n\t"
207 "add %[dst], %[dst], ip\n\t"
208 "subs %[w], %[w], ip\n\t"
212 "vld1.8 {d0}, [%[src]]!\n\t"
213 "vld1.8 {d4}, [%[dst]]!\n\t"
214 "vst1.8 {d20}, [%[keep_dst]]\n\t"
215 "sub %[keep_dst], %[dst], #8\n\t"
216 "subs %[w], %[w], #8\n\t"
218 "vqadd.u8 d20, d0, d4\n\t"
223 "vst1.8 {d20}, [%[keep_dst]]\n\t"
225 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
227 : "ip", "cc", "memory", "d0", "d4",
235 const uint8_t nil = 0;
236 const uint8x8_t vnil = vld1_dup_u8 (&nil);
240 uint8x8_t sval = vnil, dval = vnil;
241 uint8_t *dst4 = 0, *dst2 = 0;
244 dst_line += dst_stride;
246 src_line += src_stride;
251 sval = vreinterpret_u8_u32 (
252 vld1_lane_u32 ((void*)src, vreinterpret_u32_u8 (sval), 1));
253 dval = vreinterpret_u8_u32 (
254 vld1_lane_u32 ((void*)dst, vreinterpret_u32_u8 (dval), 1));
263 sval = vreinterpret_u8_u16 (
264 vld1_lane_u16 ((void*)src, vreinterpret_u16_u8 (sval), 1));
265 dval = vreinterpret_u8_u16 (
266 vld1_lane_u16 ((void*)dst, vreinterpret_u16_u8 (dval), 1));
275 sval = vld1_lane_u8 (src, sval, 1);
276 dval = vld1_lane_u8 (dst, dval, 1);
279 dval = vqadd_u8 (dval, sval);
282 vst1_lane_u8 (dst, dval, 1);
285 vst1_lane_u16 ((void*)dst2, vreinterpret_u16_u8 (dval), 1);
288 vst1_lane_u32 ((void*)dst4, vreinterpret_u32_u8 (dval), 1);
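/* The loops above show the pattern shared by all the wide paths in this
 * file: the first 8 pixels are loaded and composited, the pointers are then
 * advanced by (width & 7) so the rest of the scanline falls into whole
 * 8-pixel groups (the second group may overlap the first by up to 7 pixels),
 * and each group's result is written out one iteration late, via 'keep_dst',
 * after the next group's source and dest have already been loaded, so the
 * overlapping store never clobbers destination data that still has to be
 * read. Scanlines narrower than 8 pixels instead use the 4/2/1-pixel lane
 * loads and stores of the second loop.
 */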
294 neon_composite_over_8888_8888 (pixman_implementation_t * impl,
296 pixman_image_t * src_image,
297 pixman_image_t * mask_image,
298 pixman_image_t * dst_image,
308 uint32_t *dst_line, *dst;
309 uint32_t *src_line, *src;
310 int dst_stride, src_stride;
313 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
314 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
318 /* Use overlapping 8-pixel method */
321 uint32_t *keep_dst = 0;
322 uint8x8x4_t sval, dval, temp;
325 dst_line += dst_stride;
327 src_line += src_stride;
330 #ifndef USE_GCC_INLINE_ASM
331 sval = vld4_u8 ((void*)src);
332 dval = vld4_u8 ((void*)dst);
335 temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
336 temp = neon8qadd (sval, temp);
344 sval = vld4_u8 ((void*)src);
345 dval = vld4_u8 ((void*)dst);
347 vst4_u8 ((void*)keep_dst, temp);
350 temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
351 temp = neon8qadd (sval, temp);
358 vst4_u8 ((void*)keep_dst, temp);
361 /* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
362 "vld4.8 {d0-d3}, [%[src]]\n\t"
363 "vld4.8 {d4-d7}, [%[dst]]\n\t"
364 "mov %[keep_dst], %[dst]\n\t"
366 "and ip, %[w], #7\n\t"
367 "add %[src], %[src], ip, LSL#2\n\t"
368 "add %[dst], %[dst], ip, LSL#2\n\t"
369 "subs %[w], %[w], ip\n\t"
373 "vld4.8 {d0-d3}, [%[src]]!\n\t"
374 "vld4.8 {d4-d7}, [%[dst]]!\n\t"
375 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
376 "sub %[keep_dst], %[dst], #8*4\n\t"
377 "subs %[w], %[w], #8\n\t"
380 "vmull.u8 q10, d31, d4\n\t"
381 "vmull.u8 q11, d31, d5\n\t"
382 "vmull.u8 q12, d31, d6\n\t"
383 "vmull.u8 q13, d31, d7\n\t"
384 "vrshr.u16 q8, q10, #8\n\t"
385 "vrshr.u16 q9, q11, #8\n\t"
386 "vraddhn.u16 d20, q10, q8\n\t"
387 "vraddhn.u16 d21, q11, q9\n\t"
388 "vrshr.u16 q8, q12, #8\n\t"
389 "vrshr.u16 q9, q13, #8\n\t"
390 "vraddhn.u16 d22, q12, q8\n\t"
391 "vraddhn.u16 d23, q13, q9\n\t"
392 /* result in d20-d23 */
393 "vqadd.u8 d20, d0, d20\n\t"
394 "vqadd.u8 d21, d1, d21\n\t"
395 "vqadd.u8 d22, d2, d22\n\t"
396 "vqadd.u8 d23, d3, d23\n\t"
401 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
403 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
405 : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
406 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23"
413 uint8x8_t alpha_selector = vreinterpret_u8_u64 (
414 vcreate_u64 (0x0707070703030303ULL));
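/* vtbl1_u8 with this selector picks byte 3 for lanes 0-3 and byte 7 for
 * lanes 4-7, i.e. (on a little-endian target) it broadcasts each of the two
 * loaded a8r8g8b8 pixels' alpha across that pixel's four channel bytes.
 */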
416 /* Handle width < 8 */
420 dst_line += dst_stride;
422 src_line += src_stride;
427 uint8x8_t sval, dval;
429 /* two 32-bit pixels packed into D-reg; ad-hoc vectorization */
430 sval = vreinterpret_u8_u32 (vld1_u32 ((void*)src));
431 dval = vreinterpret_u8_u32 (vld1_u32 ((void*)dst));
432 dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
433 vst1_u8 ((void*)dst, vqadd_u8 (sval, dval));
442 uint8x8_t sval, dval;
444 /* single 32-bit pixel in lane 0 */
445 sval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)src)); /* only interested in lane 0 */
446 dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)dst)); /* only interested in lane 0 */
447 dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
448 vst1_lane_u32 ((void*)dst, vreinterpret_u32_u8 (vqadd_u8 (sval, dval)), 0);
455 neon_composite_over_8888_n_8888 (pixman_implementation_t * impl,
457 pixman_image_t * src_image,
458 pixman_image_t * mask_image,
459 pixman_image_t * dst_image,
469 uint32_t *dst_line, *dst;
470 uint32_t *src_line, *src;
472 int dst_stride, src_stride;
474 uint8x8_t mask_alpha;
476 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
477 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
479 mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
480 mask_alpha = vdup_n_u8 ((mask) >> 24);
484 /* Use overlapping 8-pixel method */
488 dst_line += dst_stride;
490 src_line += src_stride;
493 uint32_t *keep_dst = 0;
495 #ifndef USE_GCC_INLINE_ASM
496 uint8x8x4_t sval, dval, temp;
498 sval = vld4_u8 ((void*)src);
499 dval = vld4_u8 ((void*)dst);
502 sval = neon8mul (sval, mask_alpha);
503 temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
504 temp = neon8qadd (sval, temp);
512 sval = vld4_u8 ((void*)src);
513 dval = vld4_u8 ((void*)dst);
515 vst4_u8 ((void*)keep_dst, temp);
518 sval = neon8mul (sval, mask_alpha);
519 temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
520 temp = neon8qadd (sval, temp);
526 vst4_u8 ((void*)keep_dst, temp);
529 /* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
530 "vdup.32 d30, %[mask]\n\t"
531 "vdup.8 d30, d30[3]\n\t"
533 "vld4.8 {d0-d3}, [%[src]]\n\t"
534 "vld4.8 {d4-d7}, [%[dst]]\n\t"
535 "mov %[keep_dst], %[dst]\n\t"
537 "and ip, %[w], #7\n\t"
538 "add %[src], %[src], ip, LSL#2\n\t"
539 "add %[dst], %[dst], ip, LSL#2\n\t"
540 "subs %[w], %[w], ip\n\t"
544 "vld4.8 {d0-d3}, [%[src]]!\n\t"
545 "vld4.8 {d4-d7}, [%[dst]]!\n\t"
546 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
547 "sub %[keep_dst], %[dst], #8*4\n\t"
548 "subs %[w], %[w], #8\n\t"
551 "vmull.u8 q10, d30, d0\n\t"
552 "vmull.u8 q11, d30, d1\n\t"
553 "vmull.u8 q12, d30, d2\n\t"
554 "vmull.u8 q13, d30, d3\n\t"
555 "vrshr.u16 q8, q10, #8\n\t"
556 "vrshr.u16 q9, q11, #8\n\t"
557 "vraddhn.u16 d0, q10, q8\n\t"
558 "vraddhn.u16 d1, q11, q9\n\t"
559 "vrshr.u16 q9, q13, #8\n\t"
560 "vrshr.u16 q8, q12, #8\n\t"
561 "vraddhn.u16 d3, q13, q9\n\t"
562 "vraddhn.u16 d2, q12, q8\n\t"
565 "vmull.u8 q10, d31, d4\n\t"
566 "vmull.u8 q11, d31, d5\n\t"
567 "vmull.u8 q12, d31, d6\n\t"
568 "vmull.u8 q13, d31, d7\n\t"
569 "vrshr.u16 q8, q10, #8\n\t"
570 "vrshr.u16 q9, q11, #8\n\t"
571 "vraddhn.u16 d20, q10, q8\n\t"
572 "vrshr.u16 q8, q12, #8\n\t"
573 "vraddhn.u16 d21, q11, q9\n\t"
574 "vrshr.u16 q9, q13, #8\n\t"
575 "vraddhn.u16 d22, q12, q8\n\t"
576 "vraddhn.u16 d23, q13, q9\n\t"
578 /* result in d20-d23 */
579 "vqadd.u8 d20, d0, d20\n\t"
580 "vqadd.u8 d21, d1, d21\n\t"
581 "vqadd.u8 d22, d2, d22\n\t"
582 "vqadd.u8 d23, d3, d23\n\t"
587 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
589 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
591 : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
592 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
600 uint8x8_t alpha_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0707070703030303ULL));
602 /* Handle width < 8 */
606 dst_line += dst_stride;
608 src_line += src_stride;
613 uint8x8_t sval, dval;
615 sval = vreinterpret_u8_u32 (vld1_u32 ((void*)src));
616 dval = vreinterpret_u8_u32 (vld1_u32 ((void*)dst));
618 /* sval * const alpha_mul */
619 sval = neon2mul (sval, mask_alpha);
621 /* dval * 255-(src alpha) */
622 dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
624 vst1_u8 ((void*)dst, vqadd_u8 (sval, dval));
633 uint8x8_t sval, dval;
635 sval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)src));
636 dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)dst));
638 /* sval * const alpha_mul */
639 sval = neon2mul (sval, mask_alpha);
641 /* dval * 255-(src alpha) */
642 dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
644 vst1_lane_u32 ((void*)dst, vreinterpret_u32_u8 (vqadd_u8 (sval, dval)), 0);
651 neon_composite_over_n_8_0565 (pixman_implementation_t * impl,
653 pixman_image_t * src_image,
654 pixman_image_t * mask_image,
655 pixman_image_t * dst_image,
666 uint16_t *dst_line, *dst;
667 uint8_t *mask_line, *mask;
668 int dst_stride, mask_stride;
673 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
    sval2 = vreinterpret_u8_u32 (vdup_n_u32 (src));
    sval8.val[0] = vdup_lane_u8 (sval2, 0);
    sval8.val[1] = vdup_lane_u8 (sval2, 1);
    sval8.val[2] = vdup_lane_u8 (sval2, 2);
    sval8.val[3] = vdup_lane_u8 (sval2, 3);
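    /* sval2 holds the solid source colour replicated across the register;
     * sval8.val[0..3] are its four bytes (blue, green, red, alpha on a
     * little-endian target) each broadcast across a whole D register,
     * matching the channel order produced by unpack0565 above.
     */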
685 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
686 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
690 /* Use overlapping 8-pixel method, modified to avoid rewritten dest being reused */
693 uint16_t *keep_dst=0;
696 dst_line += dst_stride;
698 mask_line += mask_stride;
701 #ifndef USE_GCC_INLINE_ASM
703 uint16x8_t dval, temp;
704 uint8x8x4_t sval8temp;
706 alpha = vld1_u8 ((void*)mask);
707 dval = vld1q_u16 ((void*)dst);
710 sval8temp = neon8mul (sval8,alpha);
711 temp = pack0565 (neon8qadd (sval8temp,neon8mul (unpack0565 (dval),vmvn_u8 (sval8temp.val[3]))));
719 dval = vld1q_u16 ((void*)dst);
720 alpha = vld1_u8 ((void*)mask);
722 vst1q_u16 ((void*)keep_dst,temp);
725 sval8temp = neon8mul (sval8,alpha);
726 temp = pack0565 (neon8qadd (sval8temp,neon8mul (unpack0565 (dval),vmvn_u8 (sval8temp.val[3]))));
732 vst1q_u16 ((void*)keep_dst,temp);
735 "vdup.32 d0, %[src]\n\t"
736 "vdup.8 d1, d0[1]\n\t"
737 "vdup.8 d2, d0[2]\n\t"
738 "vdup.8 d3, d0[3]\n\t"
739 "vdup.8 d0, d0[0]\n\t"
741 "vld1.8 {q12}, [%[dst]]\n\t"
742 "vld1.8 {d31}, [%[mask]]\n\t"
743 "mov %[keep_dst], %[dst]\n\t"
745 "and ip, %[w], #7\n\t"
746 "add %[mask], %[mask], ip\n\t"
747 "add %[dst], %[dst], ip, LSL#1\n\t"
748 "subs %[w], %[w], ip\n\t"
753 "vld1.16 {q12}, [%[dst]]!\n\t"
754 "vld1.8 {d31}, [%[mask]]!\n\t"
755 "vst1.16 {q10}, [%[keep_dst]]\n\t"
756 "sub %[keep_dst], %[dst], #8*2\n\t"
757 "subs %[w], %[w], #8\n\t"
/* expand 0565 q12 to 8888 {d4-d6} (alpha is not needed) */
760 "vmovn.u16 d4, q12\t\n"
761 "vshr.u16 q11, q12, #5\t\n"
762 "vshr.u16 q10, q12, #6+5\t\n"
763 "vmovn.u16 d5, q11\t\n"
764 "vmovn.u16 d6, q10\t\n"
765 "vshl.u8 d4, d4, #3\t\n"
766 "vshl.u8 d5, d5, #2\t\n"
767 "vshl.u8 d6, d6, #3\t\n"
768 "vsri.u8 d4, d4, #5\t\n"
769 "vsri.u8 d5, d5, #6\t\n"
770 "vsri.u8 d6, d6, #5\t\n"
772 "vmull.u8 q10, d31, d0\n\t"
773 "vmull.u8 q11, d31, d1\n\t"
774 "vmull.u8 q12, d31, d2\n\t"
775 "vmull.u8 q13, d31, d3\n\t"
776 "vrshr.u16 q8, q10, #8\n\t"
777 "vrshr.u16 q9, q11, #8\n\t"
778 "vraddhn.u16 d20, q10, q8\n\t"
779 "vraddhn.u16 d21, q11, q9\n\t"
780 "vrshr.u16 q9, q13, #8\n\t"
781 "vrshr.u16 q8, q12, #8\n\t"
782 "vraddhn.u16 d23, q13, q9\n\t"
783 "vraddhn.u16 d22, q12, q8\n\t"
/* multiply dest by inverse alpha; this block is duplicated in the 4/2/1- and 8-pixel versions */
786 "vmvn.8 d30, d23\n\t"
787 "vmull.u8 q14, d30, d6\n\t"
788 "vmull.u8 q13, d30, d5\n\t"
789 "vmull.u8 q12, d30, d4\n\t"
790 "vrshr.u16 q8, q14, #8\n\t"
791 "vrshr.u16 q9, q13, #8\n\t"
792 "vraddhn.u16 d6, q14, q8\n\t"
793 "vrshr.u16 q8, q12, #8\n\t"
794 "vraddhn.u16 d5, q13, q9\n\t"
795 "vqadd.u8 d6, d6, d22\n\t" /* moved up */
796 "vraddhn.u16 d4, q12, q8\n\t"
797 /* intentionally don't calculate alpha */
798 /* result in d4-d6 */
800 /* "vqadd.u8 d6, d6, d22\n\t" ** moved up */
801 "vqadd.u8 d5, d5, d21\n\t"
802 "vqadd.u8 d4, d4, d20\n\t"
/* pack the 8888 result in {d4-d6} to 0565 in q10 */
805 "vshll.u8 q10, d6, #8\n\t"
806 "vshll.u8 q3, d5, #8\n\t"
807 "vshll.u8 q2, d4, #8\n\t"
808 "vsri.u16 q10, q3, #5\t\n"
809 "vsri.u16 q10, q2, #11\t\n"
814 "vst1.16 {q10}, [%[keep_dst]]\n\t"
816 : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst)
818 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
819 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
829 void *dst4=0, *dst2=0;
832 dst_line += dst_stride;
834 mask_line += mask_stride;
838 #ifndef USE_GCC_INLINE_ASM
840 uint16x8_t dval, temp;
841 uint8x8x4_t sval8temp;
845 alpha = vreinterpret_u8_u32 (vld1_lane_u32 ((void*)mask,vreinterpret_u32_u8 (alpha),1));
846 dval = vreinterpretq_u16_u64 (vld1q_lane_u64 ((void*)dst,vreinterpretq_u64_u16 (dval),1));
853 alpha = vreinterpret_u8_u16 (vld1_lane_u16 ((void*)mask,vreinterpret_u16_u8 (alpha),1));
854 dval = vreinterpretq_u16_u32 (vld1q_lane_u32 ((void*)dst,vreinterpretq_u32_u16 (dval),1));
861 alpha = vld1_lane_u8 ((void*)mask,alpha,1);
862 dval = vld1q_lane_u16 ((void*)dst,dval,1);
865 sval8temp = neon8mul (sval8,alpha);
866 temp = pack0565 (neon8qadd (sval8temp,neon8mul (unpack0565 (dval),vmvn_u8 (sval8temp.val[3]))));
869 vst1q_lane_u16 ((void*)dst,temp,1);
871 vst1q_lane_u32 ((void*)dst2,vreinterpretq_u32_u16 (temp),1);
873 vst1q_lane_u64 ((void*)dst4,vreinterpretq_u64_u16 (temp),1);
876 "vdup.32 d0, %[src]\n\t"
877 "vdup.8 d1, d0[1]\n\t"
878 "vdup.8 d2, d0[2]\n\t"
879 "vdup.8 d3, d0[3]\n\t"
880 "vdup.8 d0, d0[0]\n\t"
885 "vld1.64 {d25}, [%[dst]]\n\t"
886 "vld1.32 {d31[1]}, [%[mask]]\n\t"
887 "mov %[dst4], %[dst]\t\n"
888 "add %[mask], %[mask], #4\t\n"
889 "add %[dst], %[dst], #4*2\t\n"
894 "vld1.32 {d24[1]}, [%[dst]]\n\t"
895 "vld1.16 {d31[1]}, [%[mask]]\n\t"
896 "mov %[dst2], %[dst]\t\n"
897 "add %[mask], %[mask], #2\t\n"
898 "add %[dst], %[dst], #2*2\t\n"
903 "vld1.16 {d24[1]}, [%[dst]]\n\t"
904 "vld1.8 {d31[1]}, [%[mask]]\n\t"
/* expand 0565 q12 to 8888 {d4-d6} (alpha is not needed) */
908 "vmovn.u16 d4, q12\t\n"
909 "vshr.u16 q11, q12, #5\t\n"
910 "vshr.u16 q10, q12, #6+5\t\n"
911 "vmovn.u16 d5, q11\t\n"
912 "vmovn.u16 d6, q10\t\n"
913 "vshl.u8 d4, d4, #3\t\n"
914 "vshl.u8 d5, d5, #2\t\n"
915 "vshl.u8 d6, d6, #3\t\n"
916 "vsri.u8 d4, d4, #5\t\n"
917 "vsri.u8 d5, d5, #6\t\n"
918 "vsri.u8 d6, d6, #5\t\n"
920 "vmull.u8 q10, d31, d0\n\t"
921 "vmull.u8 q11, d31, d1\n\t"
922 "vmull.u8 q12, d31, d2\n\t"
923 "vmull.u8 q13, d31, d3\n\t"
924 "vrshr.u16 q8, q10, #8\n\t"
925 "vrshr.u16 q9, q11, #8\n\t"
926 "vraddhn.u16 d20, q10, q8\n\t"
927 "vraddhn.u16 d21, q11, q9\n\t"
928 "vrshr.u16 q9, q13, #8\n\t"
929 "vrshr.u16 q8, q12, #8\n\t"
930 "vraddhn.u16 d23, q13, q9\n\t"
931 "vraddhn.u16 d22, q12, q8\n\t"
/* multiply dest by inverse alpha; this block is duplicated in the 4/2/1- and 8-pixel versions */
934 "vmvn.8 d30, d23\n\t"
935 "vmull.u8 q14, d30, d6\n\t"
936 "vmull.u8 q13, d30, d5\n\t"
937 "vmull.u8 q12, d30, d4\n\t"
938 "vrshr.u16 q8, q14, #8\n\t"
939 "vrshr.u16 q9, q13, #8\n\t"
940 "vraddhn.u16 d6, q14, q8\n\t"
941 "vrshr.u16 q8, q12, #8\n\t"
942 "vraddhn.u16 d5, q13, q9\n\t"
943 "vqadd.u8 d6, d6, d22\n\t" /* moved up */
944 "vraddhn.u16 d4, q12, q8\n\t"
945 /* intentionally don't calculate alpha */
946 /* result in d4-d6 */
948 /* "vqadd.u8 d6, d6, d22\n\t" ** moved up */
949 "vqadd.u8 d5, d5, d21\n\t"
950 "vqadd.u8 d4, d4, d20\n\t"
/* pack the 8888 result in {d4-d6} to 0565 in q10 */
953 "vshll.u8 q10, d6, #8\n\t"
954 "vshll.u8 q3, d5, #8\n\t"
955 "vshll.u8 q2, d4, #8\n\t"
956 "vsri.u16 q10, q3, #5\t\n"
957 "vsri.u16 q10, q2, #11\t\n"
960 "beq skip_store1\t\n"
961 "vst1.16 {d20[1]}, [%[dst]]\t\n"
964 "beq skip_store2\t\n"
965 "vst1.32 {d20[1]}, [%[dst2]]\t\n"
968 "beq skip_store4\t\n"
969 "vst1.16 {d21}, [%[dst4]]\t\n"
972 : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [dst4] "+r" (dst4), [dst2] "+r" (dst2)
974 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
975 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
984 neon_composite_over_n_8_8888 (pixman_implementation_t * impl,
986 pixman_image_t * src_image,
987 pixman_image_t * mask_image,
988 pixman_image_t * dst_image,
999 uint32_t *dst_line, *dst;
1000 uint8_t *mask_line, *mask;
1001 int dst_stride, mask_stride;
1005 uint8x8_t mask_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0101010100000000ULL));
1006 uint8x8_t alpha_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0707070703030303ULL));
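/* mask_selector expands the two mask bytes loaded as one u16 into
 * per-channel coverage: lanes 0-3 take mask byte 0 and lanes 4-7 take mask
 * byte 1. alpha_selector then broadcasts each pixel's own alpha byte (3 and
 * 7) across its four channels, as in the other narrow-width paths.
 */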
1008 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
1010 /* bail out if fully transparent */
1015 sval2 = vreinterpret_u8_u32 (vdup_n_u32 (src));
1016 sval8.val[0] = vdup_lane_u8 (sval2, 0);
1017 sval8.val[1] = vdup_lane_u8 (sval2, 1);
1018 sval8.val[2] = vdup_lane_u8 (sval2, 2);
1019 sval8.val[3] = vdup_lane_u8 (sval2, 3);
1021 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1022 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1026 /* Use overlapping 8-pixel method, modified to avoid
1027 * rewritten dest being reused
1031 uint32_t *keep_dst = 0;
1034 dst_line += dst_stride;
1036 mask_line += mask_stride;
1039 #ifndef USE_GCC_INLINE_ASM
1041 uint8x8x4_t dval, temp;
1043 alpha = vld1_u8 ((void*)mask);
1044 dval = vld4_u8 ((void*)dst);
1047 temp = neon8mul (sval8, alpha);
1048 dval = neon8mul (dval, vmvn_u8 (temp.val[3]));
1049 temp = neon8qadd (temp, dval);
1057 alpha = vld1_u8 ((void*)mask);
1058 dval = vld4_u8 ((void*)dst);
1060 vst4_u8 ((void*)keep_dst, temp);
1063 temp = neon8mul (sval8, alpha);
1064 dval = neon8mul (dval, vmvn_u8 (temp.val[3]));
1065 temp = neon8qadd (temp, dval);
1071 vst4_u8 ((void*)keep_dst, temp);
1074 "vdup.32 d0, %[src]\n\t"
1075 "vdup.8 d1, d0[1]\n\t"
1076 "vdup.8 d2, d0[2]\n\t"
1077 "vdup.8 d3, d0[3]\n\t"
1078 "vdup.8 d0, d0[0]\n\t"
1080 "vld4.8 {d4-d7}, [%[dst]]\n\t"
1081 "vld1.8 {d31}, [%[mask]]\n\t"
1082 "mov %[keep_dst], %[dst]\n\t"
1084 "and ip, %[w], #7\n\t"
1085 "add %[mask], %[mask], ip\n\t"
1086 "add %[dst], %[dst], ip, LSL#2\n\t"
1087 "subs %[w], %[w], ip\n\t"
1091 "vld4.8 {d4-d7}, [%[dst]]!\n\t"
1092 "vld1.8 {d31}, [%[mask]]!\n\t"
1093 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
1094 "sub %[keep_dst], %[dst], #8*4\n\t"
1095 "subs %[w], %[w], #8\n\t"
1098 "vmull.u8 q10, d31, d0\n\t"
1099 "vmull.u8 q11, d31, d1\n\t"
1100 "vmull.u8 q12, d31, d2\n\t"
1101 "vmull.u8 q13, d31, d3\n\t"
1102 "vrshr.u16 q8, q10, #8\n\t"
1103 "vrshr.u16 q9, q11, #8\n\t"
1104 "vraddhn.u16 d20, q10, q8\n\t"
1105 "vraddhn.u16 d21, q11, q9\n\t"
1106 "vrshr.u16 q9, q13, #8\n\t"
1107 "vrshr.u16 q8, q12, #8\n\t"
1108 "vraddhn.u16 d23, q13, q9\n\t"
1109 "vraddhn.u16 d22, q12, q8\n\t"
1111 "vmvn.8 d30, d23\n\t"
1112 "vmull.u8 q12, d30, d4\n\t"
1113 "vmull.u8 q13, d30, d5\n\t"
1114 "vmull.u8 q14, d30, d6\n\t"
1115 "vmull.u8 q15, d30, d7\n\t"
1117 "vrshr.u16 q8, q12, #8\n\t"
1118 "vrshr.u16 q9, q13, #8\n\t"
1119 "vraddhn.u16 d4, q12, q8\n\t"
1120 "vrshr.u16 q8, q14, #8\n\t"
1121 "vraddhn.u16 d5, q13, q9\n\t"
1122 "vrshr.u16 q9, q15, #8\n\t"
1123 "vraddhn.u16 d6, q14, q8\n\t"
1124 "vraddhn.u16 d7, q15, q9\n\t"
1125 /* result in d4-d7 */
1127 "vqadd.u8 d20, d4, d20\n\t"
1128 "vqadd.u8 d21, d5, d21\n\t"
1129 "vqadd.u8 d22, d6, d22\n\t"
1130 "vqadd.u8 d23, d7, d23\n\t"
1135 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
1137 : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst)
1139 : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
1140 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
1153 dst_line += dst_stride;
1155 mask_line += mask_stride;
1160 uint8x8_t dval, temp, res;
1163 vreinterpret_u8_u16 (vld1_dup_u16 ((void*)mask)), mask_selector);
1164 dval = vld1_u8 ((void*)dst);
1166 temp = neon2mul (sval2, alpha);
1168 temp, neon2mul (dval, vtbl1_u8 (vmvn_u8 (temp), alpha_selector)));
1170 vst1_u8 ((void*)dst, res);
1179 uint8x8_t dval, temp, res;
1181 alpha = vtbl1_u8 (vld1_dup_u8 ((void*)mask), mask_selector);
1182 dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void*)dst));
1184 temp = neon2mul (sval2, alpha);
1186 temp, neon2mul (dval, vtbl1_u8 (vmvn_u8 (temp), alpha_selector)));
1188 vst1_lane_u32 ((void*)dst, vreinterpret_u32_u8 (res), 0);
1195 neon_composite_add_8888_8_8 (pixman_implementation_t * impl,
1197 pixman_image_t * src_image,
1198 pixman_image_t * mask_image,
1199 pixman_image_t * dst_image,
1209 uint8_t *dst_line, *dst;
1210 uint8_t *mask_line, *mask;
1211 int dst_stride, mask_stride;
1216 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
1217 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1218 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
1219 sa = vdup_n_u8 ((src) >> 24);
1223 /* Use overlapping 8-pixel method, modified to avoid rewritten dest being reused */
1227 dst_line += dst_stride;
1229 mask_line += mask_stride;
1232 uint8x8_t mval, dval, res;
1235 mval = vld1_u8 ((void *)mask);
1236 dval = vld1_u8 ((void *)dst);
1239 res = vqadd_u8 (neon2mul (mval, sa), dval);
1247 mval = vld1_u8 ((void *)mask);
1248 dval = vld1_u8 ((void *)dst);
1249 vst1_u8 ((void *)keep_dst, res);
1252 res = vqadd_u8 (neon2mul (mval, sa), dval);
1258 vst1_u8 ((void *)keep_dst, res);
1263 /* Use 4/2/1 load/store method to handle 1-7 pixels */
1267 dst_line += dst_stride;
1269 mask_line += mask_stride;
1272 uint8x8_t mval = sa, dval = sa, res;
1273 uint8_t *dst4 = 0, *dst2 = 0;
1277 mval = vreinterpret_u8_u32 (
1278 vld1_lane_u32 ((void *)mask, vreinterpret_u32_u8 (mval), 1));
1279 dval = vreinterpret_u8_u32 (
1280 vld1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (dval), 1));
1289 mval = vreinterpret_u8_u16 (
1290 vld1_lane_u16 ((void *)mask, vreinterpret_u16_u8 (mval), 1));
1291 dval = vreinterpret_u8_u16 (
1292 vld1_lane_u16 ((void *)dst, vreinterpret_u16_u8 (dval), 1));
1300 mval = vld1_lane_u8 (mask, mval, 1);
1301 dval = vld1_lane_u8 (dst, dval, 1);
1304 res = vqadd_u8 (neon2mul (mval, sa), dval);
1307 vst1_lane_u8 (dst, res, 1);
1309 vst1_lane_u16 ((void *)dst2, vreinterpret_u16_u8 (res), 1);
1311 vst1_lane_u32 ((void *)dst4, vreinterpret_u32_u8 (res), 1);
1316 #ifdef USE_GCC_INLINE_ASM
1319 neon_composite_src_16_16 (pixman_implementation_t * impl,
1321 pixman_image_t * src_image,
1322 pixman_image_t * mask_image,
1323 pixman_image_t * dst_image,
1333 uint16_t *dst_line, *src_line;
1334 uint32_t dst_stride, src_stride;
1336 if (!height || !width)
1339 /* We simply copy 16-bit-aligned pixels from one place to another. */
1340 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
1341 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1343 /* Preload the first input scanline */
1345 uint16_t *src_ptr = src_line;
1346 uint32_t count = width;
1350 " subs %[count], %[count], #32 \n"
1352 " add %[src], %[src], #64 \n"
1355 /* Clobbered input registers marked as input/outputs */
1356 : [src] "+r" (src_ptr), [count] "+r" (count)
1357 : /* no unclobbered inputs */
1364 uint16_t *dst_ptr = dst_line;
1365 uint16_t *src_ptr = src_line;
1366 uint32_t count = width;
/* Uses multi-register access and preloading to maximise bandwidth.
 * Each pixel is one halfword, so a quadword contains 8px.
 * Preload frequency assumes a 64-byte cache line.
 */
1374 " cmp %[count], #64 \n"
1375 " blt 1f @ skip oversized fragments \n"
1376 "0: @ start with eight quadwords at a time \n"
1377 /* preload from next scanline */
1378 " pld [%[src], %[src_stride], LSL #1] \n"
1379 " sub %[count], %[count], #64 \n"
1380 " vld1.16 {d16,d17,d18,d19}, [%[src]]! \n"
1381 " vld1.16 {d20,d21,d22,d23}, [%[src]]! \n"
1382 /* preload from next scanline */
1383 " pld [%[src], %[src_stride], LSL #1] \n"
1384 " vld1.16 {d24,d25,d26,d27}, [%[src]]! \n"
1385 " vld1.16 {d28,d29,d30,d31}, [%[src]]! \n"
1386 " cmp %[count], #64 \n"
1387 " vst1.16 {d16,d17,d18,d19}, [%[dst]]! \n"
1388 " vst1.16 {d20,d21,d22,d23}, [%[dst]]! \n"
1389 " vst1.16 {d24,d25,d26,d27}, [%[dst]]! \n"
1390 " vst1.16 {d28,d29,d30,d31}, [%[dst]]! \n"
1392 " cmp %[count], #0 \n"
1393 " beq 7f @ aligned fastpath \n"
1394 "1: @ four quadwords \n"
1395 " tst %[count], #32 \n"
1396 " beq 2f @ skip oversized fragment \n"
1397 /* preload from next scanline */
1398 " pld [%[src], %[src_stride], LSL #1] \n"
1399 " vld1.16 {d16,d17,d18,d19}, [%[src]]! \n"
1400 " vld1.16 {d20,d21,d22,d23}, [%[src]]! \n"
1401 " vst1.16 {d16,d17,d18,d19}, [%[dst]]! \n"
1402 " vst1.16 {d20,d21,d22,d23}, [%[dst]]! \n"
1403 "2: @ two quadwords \n"
1404 " tst %[count], #16 \n"
1405 " beq 3f @ skip oversized fragment \n"
1406 /* preload from next scanline */
1407 " pld [%[src], %[src_stride], LSL #1] \n"
1408 " vld1.16 {d16,d17,d18,d19}, [%[src]]! \n"
1409 " vst1.16 {d16,d17,d18,d19}, [%[dst]]! \n"
1410 "3: @ one quadword \n"
1411 " tst %[count], #8 \n"
1412 " beq 4f @ skip oversized fragment \n"
1413 " vld1.16 {d16,d17}, [%[src]]! \n"
1414 " vst1.16 {d16,d17}, [%[dst]]! \n"
1415 "4: @ one doubleword \n"
1416 " tst %[count], #4 \n"
1417 " beq 5f @ skip oversized fragment \n"
1418 " vld1.16 {d16}, [%[src]]! \n"
1419 " vst1.16 {d16}, [%[dst]]! \n"
1421 " tst %[count], #2 \n"
1422 " beq 6f @ skip oversized fragment \n"
1423 " ldr %[tmp], [%[src]], #4 \n"
1424 " str %[tmp], [%[dst]], #4 \n"
1425 "6: @ one halfword \n"
1426 " tst %[count], #1 \n"
1427 " beq 7f @ skip oversized fragment \n"
1428 " ldrh %[tmp], [%[src]] \n"
1429 " strh %[tmp], [%[dst]] \n"
1432 /* Clobbered input registers marked as input/outputs */
1433 : [dst] "+r" (dst_ptr), [src] "+r" (src_ptr),
1434 [count] "+r" (count), [tmp] "+r" (tmp)
1436 /* Unclobbered input */
1437 : [src_stride] "r" (src_stride)
1439 /* Clobbered vector registers */
1440 : "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
1441 "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31", "cc", "memory"
1444 src_line += src_stride;
1445 dst_line += dst_stride;
1449 #endif /* USE_GCC_INLINE_ASM */
1452 neon_composite_src_24_16 (pixman_implementation_t * impl,
1454 pixman_image_t * src_image,
1455 pixman_image_t * mask_image,
1456 pixman_image_t * dst_image,
1468 uint32_t dst_stride, src_stride;
1470 if (!width || !height)
1473 /* We simply copy pixels from one place to another,
1474 * assuming that the source's alpha is opaque.
1476 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1477 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1479 /* Preload the first input scanline */
1481 uint8_t *src_ptr = (uint8_t*) src_line;
1482 uint32_t count = (width + 15) / 16;
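    /* The preload loop below issues one pld per 64 bytes, i.e. one per 16
     * source pixels at 4 bytes each, hence the rounded-up count above (the
     * 64-byte cache-line size is the same assumption made in the 16bpp copy).
     */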
1484 #ifdef USE_GCC_INLINE_ASM
1487 " subs %[count], %[count], #1 \n"
1489 " add %[src], %[src], #64 \n"
1492 /* Clobbered input registers marked as input/outputs */
1493 : [src] "+r" (src_ptr), [count] "+r" (count)
1494 : /* no unclobbered inputs */
1509 uint16_t *dst_ptr = dst_line;
1510 uint32_t *src_ptr = src_line;
1511 uint32_t count = width;
1512 const uint32_t rb_mask = 0x1F;
1513 const uint32_t g_mask = 0x3F;
1515 /* If you're going to complain about a goto, take a long hard look
1516 * at the massive blocks of assembler this skips over. ;-)
1521 #ifdef USE_GCC_INLINE_ASM
1523 /* This is not as aggressive as the RGB565-source case.
1524 * Generally the source is in cached RAM when the formats are
1525 * different, so we use preload.
1527 * We don't need to blend, so we are not reading from the
1528 * uncached framebuffer.
1531 " cmp %[count], #16 \n"
1532 " blt 1f @ skip oversized fragments \n"
1533 "0: @ start with sixteen pixels at a time \n"
1534 " sub %[count], %[count], #16 \n"
1535 " pld [%[src], %[src_stride], lsl #2] @ preload from next scanline \n"
1536 " vld4.8 {d0,d1,d2,d3}, [%[src]]! @ d3 is alpha and ignored, d2-0 are rgb. \n"
1537 " vld4.8 {d4,d5,d6,d7}, [%[src]]! @ d7 is alpha and ignored, d6-4 are rgb. \n"
1538 " vshll.u8 q8, d2, #8 @ expand first red for repacking \n"
1539 " vshll.u8 q10, d1, #8 @ expand first green for repacking \n"
1540 " vshll.u8 q11, d0, #8 @ expand first blue for repacking \n"
1541 " vshll.u8 q9, d6, #8 @ expand second red for repacking \n"
1542 " vsri.u16 q8, q10, #5 @ insert first green after red \n"
1543 " vshll.u8 q10, d5, #8 @ expand second green for repacking \n"
1544 " vsri.u16 q8, q11, #11 @ insert first blue after green \n"
1545 " vshll.u8 q11, d4, #8 @ expand second blue for repacking \n"
1546 " vsri.u16 q9, q10, #5 @ insert second green after red \n"
1547 " vsri.u16 q9, q11, #11 @ insert second blue after green \n"
1548 " cmp %[count], #16 \n"
1549 " vst1.16 {d16,d17,d18,d19}, [%[dst]]! @ store 16 pixels \n"
1551 "1: @ end of main loop \n"
1552 " cmp %[count], #8 @ can we still do an 8-pixel block? \n"
1554 " sub %[count], %[count], #8 \n"
1555 " pld [%[src], %[src_stride], lsl #2] @ preload from next scanline \n"
1556 " vld4.8 {d0,d1,d2,d3}, [%[src]]! @ d3 is alpha and ignored, d2-0 are rgb. \n"
1557 " vshll.u8 q8, d2, #8 @ expand first red for repacking \n"
1558 " vshll.u8 q10, d1, #8 @ expand first green for repacking \n"
1559 " vshll.u8 q11, d0, #8 @ expand first blue for repacking \n"
1560 " vsri.u16 q8, q10, #5 @ insert first green after red \n"
1561 " vsri.u16 q8, q11, #11 @ insert first blue after green \n"
1562 " vst1.16 {d16,d17}, [%[dst]]! @ store 8 pixels \n"
1565 /* Clobbered input and working registers marked as input/outputs */
1566 : [dst] "+r" (dst_ptr), [src] "+r" (src_ptr), [count] "+r" (count)
1568 /* Unclobbered input */
1569 : [src_stride] "r" (src_stride)
1571 /* Clobbered vector registers */
1573 /* NB: these are the quad aliases of the
1574 * double registers used in the asm
1576 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17",
1577 "d18", "d19", "d20", "d21", "d22", "d23", "cc", "memory"
/* A copy of the above code, in intrinsics form. */
1583 uint8x8x4_t pixel_set_a, pixel_set_b;
1584 uint16x8_t red_a, green_a, blue_a;
1585 uint16x8_t red_b, green_b, blue_b;
1586 uint16x8_t dest_pixels_a, dest_pixels_b;
1589 __pld (src_ptr + src_stride);
1590 pixel_set_a = vld4_u8 ((uint8_t*)(src_ptr));
1591 pixel_set_b = vld4_u8 ((uint8_t*)(src_ptr + 8));
1594 red_a = vshll_n_u8 (pixel_set_a.val[2], 8);
1595 green_a = vshll_n_u8 (pixel_set_a.val[1], 8);
1596 blue_a = vshll_n_u8 (pixel_set_a.val[0], 8);
1598 red_b = vshll_n_u8 (pixel_set_b.val[2], 8);
1599 green_b = vshll_n_u8 (pixel_set_b.val[1], 8);
1600 blue_b = vshll_n_u8 (pixel_set_b.val[0], 8);
1602 dest_pixels_a = vsriq_n_u16 (red_a, green_a, 5);
1603 dest_pixels_b = vsriq_n_u16 (red_b, green_b, 5);
1605 dest_pixels_a = vsriq_n_u16 (dest_pixels_a, blue_a, 11);
1606 dest_pixels_b = vsriq_n_u16 (dest_pixels_b, blue_b, 11);
1608 /* There doesn't seem to be an intrinsic for the
1609 * double-quadword variant
1611 vst1q_u16 (dst_ptr, dest_pixels_a);
1612 vst1q_u16 (dst_ptr + 8, dest_pixels_b);
1619 uint8x8x4_t pixel_set_a;
1620 uint16x8_t red_a, green_a, blue_a;
1621 uint16x8_t dest_pixels_a;
1623 __pld (src_ptr + src_stride);
1625 pixel_set_a = vld4_u8 ((uint8_t*)(src_ptr));
1628 red_a = vshll_n_u8 (pixel_set_a.val[2], 8);
1629 green_a = vshll_n_u8 (pixel_set_a.val[1], 8);
1630 blue_a = vshll_n_u8 (pixel_set_a.val[0], 8);
1632 dest_pixels_a = vsriq_n_u16 (red_a, green_a, 5);
1633 dest_pixels_a = vsriq_n_u16 (dest_pixels_a, blue_a, 11);
1635 vst1q_u16 (dst_ptr, dest_pixels_a);
1639 #endif /* USE_GCC_INLINE_ASM */
1643 __pld (src_ptr + src_stride);
1647 uint32_t src_pixel_a = *src_ptr++;
1648 uint32_t src_pixel_b = *src_ptr++;
1650 /* ARM is really good at shift-then-ALU ops. */
1651 /* This should be a total of six shift-ANDs and five shift-ORs. */
1652 uint32_t dst_pixels_a;
1653 uint32_t dst_pixels_b;
1655 dst_pixels_a = ((src_pixel_a >> 3) & rb_mask);
1656 dst_pixels_a |= ((src_pixel_a >> 10) & g_mask) << 5;
1657 dst_pixels_a |= ((src_pixel_a >> 19) & rb_mask) << 11;
1659 dst_pixels_b = ((src_pixel_b >> 3) & rb_mask);
1660 dst_pixels_b |= ((src_pixel_b >> 10) & g_mask) << 5;
1661 dst_pixels_b |= ((src_pixel_b >> 19) & rb_mask) << 11;
1663 /* little-endian mode only */
1664 *((uint32_t*) dst_ptr) = dst_pixels_a | (dst_pixels_b << 16);
1671 uint32_t src_pixel = *src_ptr++;
1673 /* ARM is really good at shift-then-ALU ops.
1674 * This block should end up as three shift-ANDs
1675 * and two shift-ORs.
1677 uint32_t tmp_blue = (src_pixel >> 3) & rb_mask;
1678 uint32_t tmp_green = (src_pixel >> 10) & g_mask;
1679 uint32_t tmp_red = (src_pixel >> 19) & rb_mask;
1680 uint16_t dst_pixel = (tmp_red << 11) | (tmp_green << 5) | tmp_blue;
1682 *dst_ptr++ = dst_pixel;
1686 src_line += src_stride;
1687 dst_line += dst_stride;
1691 static pixman_bool_t
1692 pixman_fill_neon (uint32_t *bits,
1701 uint32_t byte_stride, color;
/* stride is always a multiple of 32-bit units in pixman */
1705 byte_stride = stride * sizeof(uint32_t);
1710 dst = ((char *) bits) + y * byte_stride + x;
1712 color = _xor << 24 | _xor << 16 | _xor << 8 | _xor;
1716 dst = ((char *) bits) + y * byte_stride + x * 2;
1718 color = _xor << 16 | _xor;
1719 width *= 2; /* width to bytes */
1723 dst = ((char *) bits) + y * byte_stride + x * 4;
1725 width *= 4; /* width to bytes */
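    /* For 8 and 16 bpp the pixel value is replicated so that the single
     * 32-bit 'color' (and the d0/q0 splat in the fillers below) covers every
     * depth, and 'width' has been converted from pixels to bytes for the
     * byte-oriented store loops.
     */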
1732 #ifdef USE_GCC_INLINE_ASM
/* We have a special case for small widths that don't allow
 * us to use wide 128-bit stores anyway. We don't waste time
 * trying to align writes, since there are only a few of them.
 */
1740 "cmp %[height], #0\n"/* Check if empty fill */
1742 "vdup.32 d0, %[color]\n"/* Fill the color to neon req */
/* Check if the width can be handled with a single store operation per
 * scanline. This significantly reduces the number of test/branch
 * instructions executed for each scanline.
 */
1748 "cmp %[width], #8\n"
1750 "cmp %[width], #4\n"
1752 "cmp %[width], #2\n"
1755 /* Loop starts here for each scanline */
1757 "mov r4, %[dst]\n" /* Starting address of the current line */
1758 "tst %[width], #8\n"
1760 "vst1.8 {d0}, [r4]!\n"
1762 "tst %[width], #4\n"
1764 "str %[color], [r4], #4\n"
1766 "tst %[width], #2\n"
1768 "strh %[color], [r4], #2\n"
1770 "tst %[width], #1\n"
1772 "strb %[color], [r4], #1\n"
1775 "subs %[height], %[height], #1\n"
1776 "add %[dst], %[dst], %[byte_stride]\n"
1780 /* Special fillers for those widths that we can do with single operation */
1782 "subs %[height], %[height], #1\n"
1783 "vst1.8 {d0}, [%[dst]]\n"
1784 "add %[dst], %[dst], %[byte_stride]\n"
1789 "subs %[height], %[height], #1\n"
1790 "str %[color], [%[dst]]\n"
1791 "add %[dst], %[dst], %[byte_stride]\n"
1796 "subs %[height], %[height], #1\n"
1797 "strh %[color], [%[dst]]\n"
1798 "add %[dst], %[dst], %[byte_stride]\n"
1803 : /* No output members */
1804 : [color] "r" (color), [height] "r" (height), [width] "r" (width),
1805 [dst] "r" (dst), [byte_stride] "r" (byte_stride)
1806 : "memory", "cc", "d0", "r4", "r5");
1811 "cmp %[height], #0\n"/* Check if empty fill */
1813 "vdup.32 q0, %[color]\n"/* Fill the color to neon req */
1815 /* Loop starts here for each scanline */
1817 "mov r4, %[dst]\n"/* Starting address of the current line */
1818 "mov r5, %[width]\n"/* We're going to write this many bytes */
1819 "ands r6, r4, #15\n"/* Are we at the 128-bit aligned address? */
1820 "beq 2f\n"/* Jump to the best case */
/* We're not 128-bit aligned. However, we know that we can get to the
 * next aligned location, since the fill is at least 16 bytes wide */
1824 "rsb r6, r6, #16\n" /* We would need to go forward this much */
1825 "sub r5, r5, r6\n"/* Update bytes left */
1828 "vst1.8 {d0[0]}, [r4]!\n"/* Store byte, now we are word aligned */
1832 "vst1.16 {d0[0]}, [r4, :16]!\n"/* Store half word, now we are 16-bit aligned */
1836 "vst1.32 {d0[0]}, [r4, :32]!\n"/* Store word, now we're 32-bit aligned */
1840 "vst1.64 {d0}, [r4, :64]!\n"/* Store qword now we're 64-bit aligned */
1842 /* The good case: We're 128-bit aligned for this scanline */
1844 "and r6, r5, #15\n"/* Number of tailing bytes */
1845 "cmp r5, r6\n"/* Do we have at least one qword to write? */
1846 "beq 6f\n"/* No, we just write the tail */
1847 "lsr r5, r5, #4\n"/* This many full qwords to write */
1849 /* The main block: Do 128-bit aligned writes */
1852 "vst1.64 {d0,d1}, [r4, :128]!\n"
/* Handle the trailing bytes: do 64, 32, 16 and 8-bit aligned writes as needed.
 * We know that we're currently at a 128-bit aligned address, so we can just
 * pick the biggest operations that the remaining write width allows */
1863 "vst1.64 {d0}, [r4, :64]!\n"
1867 "vst1.32 {d0[0]}, [r4, :32]!\n"
1871 "vst1.16 {d0[0]}, [r4, :16]!\n"
1875 "vst1.8 {d0[0]}, [r4]!\n"
1878 /* Handle the next scanline */
1879 "subs %[height], %[height], #1\n"
1880 "add %[dst], %[dst], %[byte_stride]\n"
1883 : /* No output members */
1884 : [color] "r" (color), [height] "r" (height), [width] "r" (width),
1885 [dst] "r" (dst), [byte_stride] "r" (byte_stride)
1886 : "memory", "cc", "q0", "d0", "d1", "r4", "r5", "r6");
1892 /* TODO: intrinsic version for armcc */
/* TODO: is a more generic way of doing this being introduced? */
1899 #define NEON_SCANLINE_BUFFER_PIXELS (1024)
1902 neon_quadword_copy (void* dst,
1904 uint32_t count, /* of quadwords */
1905 uint32_t trailer_count /* of bytes */)
1907 uint8_t *t_dst = dst, *t_src = src;
/* Uses aligned multi-register loads to maximise read bandwidth
 * on uncached memory such as framebuffers.
 * The accesses do not have the aligned qualifiers, so that the copy
 * may convert between aligned-uncached and unaligned-cached memory.
 * It is assumed that the CPU can infer alignment from the address.
 */
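/* Callers split a byte length 'len' as
 *     neon_quadword_copy (dst, src, len >> 4, len & 15);
 * i.e. 'count' whole quadwords followed by 'trailer_count' leftover bytes,
 * which is how the scanline-buffer paths below invoke it.
 */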
1916 #ifdef USE_GCC_INLINE_ASM
1919 " cmp %[count], #8 \n"
1920 " blt 1f @ skip oversized fragments \n"
1921 "0: @ start with eight quadwords at a time \n"
1922 " sub %[count], %[count], #8 \n"
1923 " vld1.8 {d16,d17,d18,d19}, [%[src]]! \n"
1924 " vld1.8 {d20,d21,d22,d23}, [%[src]]! \n"
1925 " vld1.8 {d24,d25,d26,d27}, [%[src]]! \n"
1926 " vld1.8 {d28,d29,d30,d31}, [%[src]]! \n"
1927 " cmp %[count], #8 \n"
1928 " vst1.8 {d16,d17,d18,d19}, [%[dst]]! \n"
1929 " vst1.8 {d20,d21,d22,d23}, [%[dst]]! \n"
1930 " vst1.8 {d24,d25,d26,d27}, [%[dst]]! \n"
1931 " vst1.8 {d28,d29,d30,d31}, [%[dst]]! \n"
1933 "1: @ four quadwords \n"
1934 " tst %[count], #4 \n"
1935 " beq 2f @ skip oversized fragment \n"
1936 " vld1.8 {d16,d17,d18,d19}, [%[src]]! \n"
1937 " vld1.8 {d20,d21,d22,d23}, [%[src]]! \n"
1938 " vst1.8 {d16,d17,d18,d19}, [%[dst]]! \n"
1939 " vst1.8 {d20,d21,d22,d23}, [%[dst]]! \n"
1940 "2: @ two quadwords \n"
1941 " tst %[count], #2 \n"
1942 " beq 3f @ skip oversized fragment \n"
1943 " vld1.8 {d16,d17,d18,d19}, [%[src]]! \n"
1944 " vst1.8 {d16,d17,d18,d19}, [%[dst]]! \n"
1945 "3: @ one quadword \n"
1946 " tst %[count], #1 \n"
1947 " beq 4f @ skip oversized fragment \n"
1948 " vld1.8 {d16,d17}, [%[src]]! \n"
1949 " vst1.8 {d16,d17}, [%[dst]]! \n"
1952 /* Clobbered input registers marked as input/outputs */
1953 : [dst] "+r" (t_dst), [src] "+r" (t_src), [count] "+r" (count)
1955 /* No unclobbered inputs */
1958 /* Clobbered vector registers */
1959 : "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25",
1960 "d26", "d27", "d28", "d29", "d30", "d31", "cc", "memory");
1966 uint8x16x4_t t1 = vld4q_u8 (t_src);
1967 uint8x16x4_t t2 = vld4q_u8 (t_src + sizeof(uint8x16x4_t));
1969 t_src += sizeof(uint8x16x4_t) * 2;
1970 vst4q_u8 (t_dst, t1);
1971 vst4q_u8 (t_dst + sizeof(uint8x16x4_t), t2);
1972 t_dst += sizeof(uint8x16x4_t) * 2;
1978 uint8x16x4_t t1 = vld4q_u8 (t_src);
1980 t_src += sizeof(uint8x16x4_t);
1981 vst4q_u8 (t_dst, t1);
1982 t_dst += sizeof(uint8x16x4_t);
1987 uint8x8x4_t t1 = vld4_u8 (t_src);
1989 t_src += sizeof(uint8x8x4_t);
1990 vst4_u8 (t_dst, t1);
1991 t_dst += sizeof(uint8x8x4_t);
1996 uint8x16_t t1 = vld1q_u8 (t_src);
1998 t_src += sizeof(uint8x16_t);
1999 vst1q_u8 (t_dst, t1);
2000 t_dst += sizeof(uint8x16_t);
2003 #endif /* !USE_GCC_INLINE_ASM */
2007 if (trailer_count & 8)
2009 uint8x8_t t1 = vld1_u8 (t_src);
2011 t_src += sizeof(uint8x8_t);
2012 vst1_u8 (t_dst, t1);
2013 t_dst += sizeof(uint8x8_t);
2016 if (trailer_count & 4)
2018 *((uint32_t*) t_dst) = *((uint32_t*) t_src);
2024 if (trailer_count & 2)
2026 *((uint16_t*) t_dst) = *((uint16_t*) t_src);
2032 if (trailer_count & 1)
2034 *t_dst++ = *t_src++;
2040 solid_over_565_8_pix_neon (uint32_t glyph_colour,
2043 uint32_t dest_stride, /* bytes, not elements */
2044 uint32_t mask_stride,
2045 uint32_t count /* 8-pixel groups */)
2047 /* Inner loop of glyph blitter (solid colour, alpha mask) */
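/* Per channel this computes, approximately,
 *     result = (glyph_channel * m + dest_channel * (255 - m)) / 255
 * where m = mask_byte * glyph_alpha / 255, with the r5g6b5 destination
 * expanded to 8 bits per channel on load and repacked on store (the divides
 * are implemented as the usual shift-by-8 approximations).
 */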
2049 #ifdef USE_GCC_INLINE_ASM
2052 " vld4.8 {d20[],d21[],d22[],d23[]}, [%[glyph_colour]] @ splat solid colour components \n"
2054 " vld1.16 {d0,d1}, [%[dest]] @ load first pixels from framebuffer \n"
2055 " vld1.8 {d17}, [%[in_mask]] @ load alpha mask of glyph \n"
2056 " vmull.u8 q9, d17, d23 @ apply glyph colour alpha to mask \n"
2057 " vshrn.u16 d17, q9, #8 @ reformat it to match original mask \n"
2058 " vmvn d18, d17 @ we need the inverse mask for the background \n"
2059 " vsli.u16 q3, q0, #5 @ duplicate framebuffer blue bits \n"
2060 " vshrn.u16 d2, q0, #8 @ unpack red from framebuffer pixels \n"
2061 " vshrn.u16 d4, q0, #3 @ unpack green \n"
2062 " vsri.u8 d2, d2, #5 @ duplicate red bits (extend 5 to 8) \n"
2063 " vshrn.u16 d6, q3, #2 @ unpack extended blue (truncate 10 to 8) \n"
2064 " vsri.u8 d4, d4, #6 @ duplicate green bits (extend 6 to 8) \n"
2065 " vmull.u8 q1, d2, d18 @ apply inverse mask to background red... \n"
2066 " vmull.u8 q2, d4, d18 @ ...green... \n"
2067 " vmull.u8 q3, d6, d18 @ ...blue \n"
2068 " subs %[count], %[count], #1 @ decrement/test loop counter \n"
2069 " vmlal.u8 q1, d17, d22 @ add masked foreground red... \n"
2070 " vmlal.u8 q2, d17, d21 @ ...green... \n"
2071 " vmlal.u8 q3, d17, d20 @ ...blue \n"
2072 " add %[in_mask], %[in_mask], %[mask_stride] @ advance mask pointer, while we wait \n"
2073 " vsri.16 q1, q2, #5 @ pack green behind red \n"
2074 " vsri.16 q1, q3, #11 @ pack blue into pixels \n"
2075 " vst1.16 {d2,d3}, [%[dest]] @ store composited pixels \n"
2076 " add %[dest], %[dest], %[dest_stride] @ advance framebuffer pointer \n"
2077 " bne 0b @ next please \n"
2079 /* Clobbered registers marked as input/outputs */
2080 : [dest] "+r" (dest), [in_mask] "+r" (in_mask), [count] "+r" (count)
2083 : [dest_stride] "r" (dest_stride), [mask_stride] "r" (mask_stride), [glyph_colour] "r" (&glyph_colour)
2085 /* Clobbers, including the inputs we modify, and potentially lots of memory */
2086 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d17", "d18", "d19",
2087 "d20", "d21", "d22", "d23", "d24", "d25", "cc", "memory"
2092 uint8x8x4_t solid_colour = vld4_dup_u8 ((uint8_t*) &glyph_colour);
2096 uint16x8_t pixels = vld1q_u16 (dest);
2097 uint8x8_t mask = vshrn_n_u16 (vmull_u8 (solid_colour.val[3], vld1_u8 (in_mask)), 8);
2098 uint8x8_t mask_image = vmvn_u8 (mask);
2100 uint8x8_t t_red = vshrn_n_u16 (pixels, 8);
2101 uint8x8_t t_green = vshrn_n_u16 (pixels, 3);
	uint8x8_t t_blue  = vshrn_n_u16 (vsliq_n_u16 (pixels, pixels, 5), 2); /* q-form: 'pixels' is a uint16x8_t */
2104 uint16x8_t s_red = vmull_u8 (vsri_n_u8 (t_red, t_red, 5), mask_image);
2105 uint16x8_t s_green = vmull_u8 (vsri_n_u8 (t_green, t_green, 6), mask_image);
2106 uint16x8_t s_blue = vmull_u8 (t_blue, mask_image);
	s_red   = vmlal_u8 (s_red, mask, solid_colour.val[2]);
	s_green = vmlal_u8 (s_green, mask, solid_colour.val[1]);
	s_blue  = vmlal_u8 (s_blue, mask, solid_colour.val[0]);
2112 pixels = vsri_n_u16 (s_red, s_green, 5);
2113 pixels = vsri_n_u16 (pixels, s_blue, 11);
2114 vst1q_u16 (dest, pixels);
	dest = (uint16_t *)((uint8_t *)dest + dest_stride); /* dest_stride is in bytes */
	in_mask += mask_stride;
2123 #if 0 /* this is broken currently */
2125 neon_composite_over_n_8_0565 (pixman_implementation_t * impl,
2127 pixman_image_t * src_image,
2128 pixman_image_t * mask_image,
2129 pixman_image_t * dst_image,
2140 uint16_t *dst_line, *aligned_line;
2142 uint32_t dst_stride, mask_stride;
2143 uint32_t kernel_count, copy_count, copy_tail;
2144 uint8_t kernel_offset, copy_offset;
2146 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2148 /* bail out if fully transparent or degenerate */
2153 if (width == 0 || height == 0)
2156 if (width > NEON_SCANLINE_BUFFER_PIXELS)
2158 /* split the blit, so we can use a fixed-size scanline buffer
2159 * TODO: there must be a more elegant way of doing this.
2162 for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS)
2164 neon_composite_over_n_8_0565 (
2166 src_image, mask_image, dst_image,
2167 src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y,
2168 (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height);
2174 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2175 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2177 /* keep within minimum number of aligned quadwords on width
2178 * while also keeping the minimum number of columns to process
2181 unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
2182 unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
2183 unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
2185 /* the fast copy should be quadword aligned */
2186 copy_offset = dst_line - ((uint16_t*) aligned_left);
2187 aligned_line = dst_line - copy_offset;
2188 copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
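    /* Illustrative example (hypothetical addresses): with dst_line == 0x1002
     * and width == 100, aligned_left == 0x1000, aligned_right == 0x10d0 and
     * ceiling_length == 208, so copy_offset == 1 pixel, aligned_line ==
     * 0x1000 and copy_count == 13 quadwords; the copy then covers the
     * scanline plus at most a few pixels of slack at each end.
     */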
2191 if (aligned_right - aligned_left > ceiling_length)
2193 /* unaligned routine is tightest */
2194 kernel_count = (uint32_t) (ceiling_length >> 4);
2195 kernel_offset = copy_offset;
2199 /* aligned routine is equally tight, so it is safer to align */
2200 kernel_count = copy_count;
2204 /* We should avoid reading beyond scanline ends for safety */
2205 if (aligned_line < (dst_line - dest_x) ||
2206 (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
2208 /* switch to precise read */
2209 copy_offset = kernel_offset = 0;
2210 aligned_line = dst_line;
2211 kernel_count = (uint32_t) (ceiling_length >> 4);
2212 copy_count = (width * sizeof(*dst_line)) >> 4;
2213 copy_tail = (width * sizeof(*dst_line)) & 0xF;
2218 uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8]; /* deliberately not initialised */
2219 uint8_t glyph_line[NEON_SCANLINE_BUFFER_PIXELS + 8];
2222 /* row-major order */
2223 /* left edge, middle block, right edge */
2224 for ( ; y--; mask_line += mask_stride, aligned_line += dst_stride, dst_line += dst_stride)
2226 /* We don't want to overrun the edges of the glyph,
2227 * so realign the edge data into known buffers
2229 neon_quadword_copy (glyph_line + copy_offset, mask_line, width >> 4, width & 0xF);
2231 /* Uncached framebuffer access is really, really slow
2232 * if we do it piecemeal. It should be much faster if we
2233 * grab it all at once. One scanline should easily fit in
2234 * L1 cache, so this should not waste RAM bandwidth.
2236 neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail);
2238 /* Apply the actual filter */
2239 solid_over_565_8_pix_neon (
2240 src, scan_line + kernel_offset,
2241 glyph_line + kernel_offset, 8 * sizeof(*dst_line),
2244 /* Copy the modified scanline back */
2245 neon_quadword_copy (dst_line, scan_line + copy_offset,
2246 width >> 3, (width & 7) * 2);
2252 #ifdef USE_GCC_INLINE_ASM
2255 plain_over_565_8_pix_neon (uint32_t colour,
2257 uint32_t dest_stride, /* bytes, not elements */
2258 uint32_t count /* 8-pixel groups */)
2260 /* Inner loop for plain translucent rects
2261 * (solid colour without alpha mask)
2264 " vld4.8 {d20[],d21[],d22[],d23[]}, [%[colour]] @ solid colour load/splat \n"
2265 " vmull.u8 q12, d23, d22 @ premultiply alpha red \n"
2266 " vmull.u8 q13, d23, d21 @ premultiply alpha green \n"
2267 " vmull.u8 q14, d23, d20 @ premultiply alpha blue \n"
2268 " vmvn d18, d23 @ inverse alpha for background \n"
2270 " vld1.16 {d0,d1}, [%[dest]] @ load first pixels from framebuffer \n"
2271 " vshrn.u16 d2, q0, #8 @ unpack red from framebuffer pixels \n"
2272 " vshrn.u16 d4, q0, #3 @ unpack green \n"
2273 " vsli.u16 q3, q0, #5 @ duplicate framebuffer blue bits \n"
2274 " vsri.u8 d2, d2, #5 @ duplicate red bits (extend 5 to 8) \n"
2275 " vsri.u8 d4, d4, #6 @ duplicate green bits (extend 6 to 8) \n"
2276 " vshrn.u16 d6, q3, #2 @ unpack extended blue (truncate 10 to 8) \n"
2277 " vmov q0, q12 @ retrieve foreground red \n"
2278 " vmlal.u8 q0, d2, d18 @ blend red - my kingdom for a four-operand MLA \n"
2279 " vmov q1, q13 @ retrieve foreground green \n"
2280 " vmlal.u8 q1, d4, d18 @ blend green \n"
2281 " vmov q2, q14 @ retrieve foreground blue \n"
2282 " vmlal.u8 q2, d6, d18 @ blend blue \n"
2283 " subs %[count], %[count], #1 @ decrement/test loop counter \n"
2284 " vsri.16 q0, q1, #5 @ pack green behind red \n"
2285 " vsri.16 q0, q2, #11 @ pack blue into pixels \n"
2286 " vst1.16 {d0,d1}, [%[dest]] @ store composited pixels \n"
2287 " add %[dest], %[dest], %[dest_stride] @ advance framebuffer pointer \n"
2288 " bne 0b @ next please \n"
2290 /* Clobbered registers marked as input/outputs */
2291 : [dest] "+r" (dest), [count] "+r" (count)
2294 : [dest_stride] "r" (dest_stride), [colour] "r" (&colour)
2296 /* Clobbers, including the inputs we modify, and
2297 * potentially lots of memory
2299 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d18", "d19",
2300 "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
2306 neon_composite_over_n_0565 (pixman_implementation_t * impl,
2308 pixman_image_t * src_image,
2309 pixman_image_t * mask_image,
2310 pixman_image_t * dst_image,
2321 uint16_t *dst_line, *aligned_line;
2322 uint32_t dst_stride;
2323 uint32_t kernel_count, copy_count, copy_tail;
2324 uint8_t kernel_offset, copy_offset;
2326 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2328 /* bail out if fully transparent */
2333 if (width == 0 || height == 0)
2336 if (width > NEON_SCANLINE_BUFFER_PIXELS)
2338 /* split the blit, so we can use a fixed-size scanline buffer *
2339 * TODO: there must be a more elegant way of doing this.
2343 for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS)
2345 neon_composite_over_n_0565 (
2347 src_image, mask_image, dst_image,
2348 src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y,
2349 (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height);
2354 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2356 /* keep within minimum number of aligned quadwords on width
2357 * while also keeping the minimum number of columns to process
2360 unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
2361 unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
2362 unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
2364 /* the fast copy should be quadword aligned */
2365 copy_offset = dst_line - ((uint16_t*) aligned_left);
2366 aligned_line = dst_line - copy_offset;
2367 copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
2370 if (aligned_right - aligned_left > ceiling_length)
2372 /* unaligned routine is tightest */
2373 kernel_count = (uint32_t) (ceiling_length >> 4);
2374 kernel_offset = copy_offset;
2378 /* aligned routine is equally tight, so it is safer to align */
2379 kernel_count = copy_count;
2383 /* We should avoid reading beyond scanline ends for safety */
2384 if (aligned_line < (dst_line - dest_x) ||
2385 (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
2387 /* switch to precise read */
2388 copy_offset = kernel_offset = 0;
2389 aligned_line = dst_line;
2390 kernel_count = (uint32_t) (ceiling_length >> 4);
2391 copy_count = (width * sizeof(*dst_line)) >> 4;
2392 copy_tail = (width * sizeof(*dst_line)) & 0xF;
2397 uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8]; /* deliberately not initialised */
2399 /* row-major order */
2400 /* left edge, middle block, right edge */
2401 for ( ; height--; aligned_line += dst_stride, dst_line += dst_stride)
2403 /* Uncached framebuffer access is really, really slow if we do it piecemeal.
2404 * It should be much faster if we grab it all at once.
2405 * One scanline should easily fit in L1 cache, so this should
2406 * not waste RAM bandwidth.
2408 neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail);
2410 /* Apply the actual filter */
2411 plain_over_565_8_pix_neon (
2412 src, scan_line + kernel_offset, 8 * sizeof(*dst_line), kernel_count);
2414 /* Copy the modified scanline back */
2415 neon_quadword_copy (
2416 dst_line, scan_line + copy_offset, width >> 3, (width & 7) * 2);
2422 ARGB8_over_565_8_pix_neon (uint32_t *src,
2424 uint32_t src_stride, /* bytes, not elements */
2425 uint32_t count /* 8-pixel groups */)
2429 " pld [%[src], %[src_stride]] @ preload from next scanline \n"
2430 " vld1.16 {d0,d1}, [%[dest]] @ load pixels from framebuffer \n"
2431 " vld4.8 {d20,d21,d22,d23},[%[src]]! @ load source image pixels \n"
2432 " vsli.u16 q3, q0, #5 @ duplicate framebuffer blue bits \n"
2433 " vshrn.u16 d2, q0, #8 @ unpack red from framebuffer pixels \n"
2434 " vshrn.u16 d4, q0, #3 @ unpack green \n"
2435 " vmvn d18, d23 @ we need the inverse alpha for the background \n"
2436 " vsri.u8 d2, d2, #5 @ duplicate red bits (extend 5 to 8) \n"
2437 " vshrn.u16 d6, q3, #2 @ unpack extended blue (truncate 10 to 8) \n"
2438 " vsri.u8 d4, d4, #6 @ duplicate green bits (extend 6 to 8) \n"
2439 " vmull.u8 q1, d2, d18 @ apply inverse alpha to background red... \n"
2440 " vmull.u8 q2, d4, d18 @ ...green... \n"
2441 " vmull.u8 q3, d6, d18 @ ...blue \n"
2442 " subs %[count], %[count], #1 @ decrement/test loop counter \n"
2443 " vmlal.u8 q1, d23, d22 @ add blended foreground red... \n"
2444 " vmlal.u8 q2, d23, d21 @ ...green... \n"
2445 " vmlal.u8 q3, d23, d20 @ ...blue \n"
2446 " vsri.16 q1, q2, #5 @ pack green behind red \n"
2447 " vsri.16 q1, q3, #11 @ pack blue into pixels \n"
2448 " vst1.16 {d2,d3}, [%[dest]]! @ store composited pixels \n"
2449 " bne 0b @ next please \n"
2451 /* Clobbered registers marked as input/outputs */
2452 : [dest] "+r" (dest), [src] "+r" (src), [count] "+r" (count)
2455 : [src_stride] "r" (src_stride)
2457 /* Clobbers, including the inputs we modify, and potentially lots of memory */
2458 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d17", "d18", "d20",
2459 "d21", "d22", "d23", "cc", "memory"
2464 neon_composite_over_8888_0565 (pixman_implementation_t * impl,
2466 pixman_image_t * src_image,
2467 pixman_image_t * mask_image,
2468 pixman_image_t * dst_image,
2479 uint16_t *dst_line, *aligned_line;
2480 uint32_t dst_stride, src_stride;
2481 uint32_t kernel_count, copy_count, copy_tail;
2482 uint8_t kernel_offset, copy_offset;
2484 /* we assume mask is opaque
2485 * so the only alpha to deal with is embedded in src */
2487 if (width > NEON_SCANLINE_BUFFER_PIXELS)
2489 /* split the blit, so we can use a fixed-size scanline buffer */
2491 for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS)
2493 neon_composite_over_8888_0565 (
2495 src_image, mask_image, dst_image,
2496 src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y,
2497 (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height);
2502 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2503 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2505 /* keep within minimum number of aligned quadwords on width
2506 * while also keeping the minimum number of columns to process */
2509 unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
2510 unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
2511 unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
2513 /* the fast copy should be quadword aligned */
2514 copy_offset = dst_line - ((uint16_t*) aligned_left);
2515 aligned_line = dst_line - copy_offset;
2516 copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
2519 if (aligned_right - aligned_left > ceiling_length)
2521 /* unaligned routine is tightest */
2522 kernel_count = (uint32_t) (ceiling_length >> 4);
2523 kernel_offset = copy_offset;
2527 /* aligned routine is equally tight, so it is safer to align */
2528 kernel_count = copy_count;
2532 /* We should avoid reading beyond scanline ends for safety */
2533 if (aligned_line < (dst_line - dest_x) ||
2534 (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
2536 /* switch to precise read */
2537 copy_offset = kernel_offset = 0;
2538 aligned_line = dst_line;
2539 kernel_count = (uint32_t) (ceiling_length >> 4);
2540 copy_count = (width * sizeof(*dst_line)) >> 4;
2541 copy_tail = (width * sizeof(*dst_line)) & 0xF;
2545 /* Preload the first input scanline */
2547 uint8_t *src_ptr = (uint8_t*) src_line;
2548 uint32_t count = (width + 15) / 16;
2550 #ifdef USE_GCC_INLINE_ASM
2553 " subs %[count], %[count], #1 \n"
2555 " add %[src], %[src], #64 \n"
2558 /* Clobbered input registers marked as input/outputs */
2559 : [src] "+r" (src_ptr), [count] "+r" (count)
2560 : /* no unclobbered inputs */
2574 uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8]; /* deliberately not initialised */
2576 /* row-major order */
2577 /* left edge, middle block, right edge */
2578 for ( ; height--; src_line += src_stride, aligned_line += dst_stride)
2580 /* Uncached framebuffer access is really, really slow if we do
2581 * it piecemeal. It should be much faster if we grab it all at
2582 * once. One scanline should easily fit in L1 cache, so this
2583 * should not waste RAM bandwidth. */
2585 neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail);
2587 /* Apply the actual filter */
2588 ARGB8_over_565_8_pix_neon (
2589 src_line, scan_line + kernel_offset,
2590 src_stride * sizeof(*src_line), kernel_count);
2592 /* Copy the modified scanline back */
2593 neon_quadword_copy (dst_line,
2594 scan_line + copy_offset,
2595 width >> 3, (width & 7) * 2);
2600 #endif /* USE_GCC_INLINE_ASM */
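/* Each entry below pairs a compositing operator with source, mask and
 * destination formats, the routine that implements that combination, and a
 * flags word; arm_neon_composite () below searches this table with
 * _pixman_run_fast_path () and delegates when nothing matches. */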
2602 static const pixman_fast_path_t arm_neon_fast_path_array[] =
2604 { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, neon_composite_add_8888_8_8, 0 },
2605 { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, neon_composite_add_8000_8000, 0 },
2606 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, neon_composite_over_n_8_0565, 0 },
2607 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, neon_composite_over_n_8_0565, 0 },
2608 { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_src_24_16, 0 },
2609 { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_src_24_16, 0 },
2610 { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_src_24_16, 0 },
2611 { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_src_24_16, 0 },
2612 #ifdef USE_GCC_INLINE_ASM
2613 { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_src_16_16, 0 },
2614 { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_src_16_16, 0 },
2615 #if 0 /* this code has some bugs */
2616 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_over_n_0565, 0 },
2617 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_over_n_0565, 0 },
2618 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_over_8888_0565, 0 },
2619 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_over_8888_0565, 0 },
2622 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, neon_composite_over_8888_8888, 0 },
2623 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, neon_composite_over_8888_8888, 0 },
2624 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, neon_composite_over_8888_8888, 0 },
2625 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, neon_composite_over_8888_8888, 0 },
2626 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK },
2627 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK },
2628 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, neon_composite_over_n_8_8888, 0 },
2629 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, neon_composite_over_n_8_8888, 0 },
2630 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, neon_composite_over_n_8_8888, 0 },
2631 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, neon_composite_over_n_8_8888, 0 },
2635 const pixman_fast_path_t *const arm_neon_fast_paths = arm_neon_fast_path_array;
2638 arm_neon_composite (pixman_implementation_t *imp,
2640 pixman_image_t * src,
2641 pixman_image_t * mask,
2642 pixman_image_t * dest,
2652 if (_pixman_run_fast_path (arm_neon_fast_paths, imp,
2653 op, src, mask, dest,
2662 _pixman_implementation_composite (imp->delegate, op,
2670 static pixman_bool_t
2671 pixman_blt_neon (void *src_bits,
2684 if (!width || !height)
2687 /* accelerate only straight copies involving complete bytes */
2688 if (src_bpp != dst_bpp || (src_bpp & 7))
2692 uint32_t bytes_per_pixel = src_bpp >> 3;
2693 uint32_t byte_width = width * bytes_per_pixel;
2694 /* stride parameters are in 32-bit words (pixman's rowstride unit), not bytes */
2695 int32_t src_stride_bytes = src_stride * 4;
2696 int32_t dst_stride_bytes = dst_stride * 4;
2697 uint8_t *src_bytes = ((uint8_t*) src_bits) +
2698 src_y * src_stride_bytes + src_x * bytes_per_pixel;
2699 uint8_t *dst_bytes = ((uint8_t*) dst_bits) +
2700 dst_y * dst_stride_bytes + dst_x * bytes_per_pixel;
2701 uint32_t quadword_count = byte_width / 16;
2702 uint32_t offset = byte_width % 16;
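/* Worked example (illustrative): a 50-pixel-wide blit at 16 bpp has
 * byte_width == 100, so each scanline is moved as quadword_count == 6
 * full quadwords plus a 4-byte tail. In plain C the per-scanline copy is
 * equivalent to (a sketch of what neon_quadword_copy achieves):
 *     memcpy (dst_bytes, src_bytes, quadword_count * 16 + offset);
 */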
2706 neon_quadword_copy (dst_bytes, src_bytes, quadword_count, offset);
2707 src_bytes += src_stride_bytes;
2708 dst_bytes += dst_stride_bytes;
2715 static pixman_bool_t
2716 arm_neon_blt (pixman_implementation_t *imp,
2717 uint32_t * src_bits,
2718 uint32_t * dst_bits,
2730 if (pixman_blt_neon (
2731 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
2732 src_x, src_y, dst_x, dst_y, width, height))
2737 return _pixman_implementation_blt (
2739 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
2740 src_x, src_y, dst_x, dst_y, width, height);
2743 static pixman_bool_t
2744 arm_neon_fill (pixman_implementation_t *imp,
2754 if (pixman_fill_neon (bits, stride, bpp, x, y, width, height, xor))
2757 return _pixman_implementation_fill (
2758 imp->delegate, bits, stride, bpp, x, y, width, height, xor);
2761 pixman_implementation_t *
2762 _pixman_implementation_create_arm_neon (void)
2764 pixman_implementation_t *simd = _pixman_implementation_create_arm_simd ();
2765 pixman_implementation_t *imp = _pixman_implementation_create (simd);
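/* Whatever this NEON implementation does not override, or any composite
 * request for which no fast path matches, falls through to the plain ARM
 * SIMD implementation created above via imp->delegate. */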
2767 imp->composite = arm_neon_composite;
2768 #if 0 /* this code has some bugs */
2769 imp->blt = arm_neon_blt;
2771 imp->fill = arm_neon_fill;