2 * Copyright © 2009 ARM Ltd, Movial Creative Technologies Oy
4 * Permission to use, copy, modify, distribute, and sell this software and its
5 * documentation for any purpose is hereby granted without fee, provided that
6 * the above copyright notice appear in all copies and that both that
7 * copyright notice and this permission notice appear in supporting
8 * documentation, and that the name of ARM Ltd not be used in
9 * advertising or publicity pertaining to distribution of the software without
10 * specific, written prior permission. ARM Ltd makes no
11 * representations about the suitability of this software for any purpose. It
12 * is provided "as is" without express or implied warranty.
14 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
15 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
16 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
17 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
18 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
19 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
20 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
23 * Author: Ian Rickards (ian.rickards@arm.com)
24 * Author: Jonathan Morton (jonathan.morton@movial.com)
25 * Author: Markku Vire (markku.vire@movial.com)
35 #include "pixman-private.h"
37 /* Deal with an intrinsic that is defined differently in GCC */
38 #if !defined(__ARMCC_VERSION) && !defined(__pld)
39 #define __pld(_x) __builtin_prefetch (_x)
42 static force_inline uint8x8x4_t
43 unpack0565 (uint16x8_t rgb)
48 res.val[3] = vdup_n_u8 (0);
49 gb = vshrq_n_u16 (rgb, 5);
50 b = vshrq_n_u16 (rgb, 5 + 6);
52 res.val[0] = vmovn_u16 (rgb); /* get low 5 bits */
53 res.val[1] = vmovn_u16 (gb); /* get mid 6 bits */
54 res.val[2] = vmovn_u16 (b); /* get top 5 bits */
56 res.val[0] = vshl_n_u8 (res.val[0], 3); /* shift to top */
57 res.val[1] = vshl_n_u8 (res.val[1], 2); /* shift to top */
58 res.val[2] = vshl_n_u8 (res.val[2], 3); /* shift to top */
60 res.val[0] = vsri_n_u8 (res.val[0], res.val[0], 5);
61 res.val[1] = vsri_n_u8 (res.val[1], res.val[1], 6);
62 res.val[2] = vsri_n_u8 (res.val[2], res.val[2], 5);
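/* Roughly, the three vshl/vsri pairs above are the usual 565 -> 888
 * expansion by bit replication; a scalar sketch of the same step, per
 * 16-bit pixel p and with the channel names left generic, would be:
 *
 *     c0 = p & 0x1f;          c0 = (c0 << 3) | (c0 >> 2);   -- 5 -> 8 bits
 *     c1 = (p >> 5) & 0x3f;   c1 = (c1 << 2) | (c1 >> 4);   -- 6 -> 8 bits
 *     c2 = p >> 11;           c2 = (c2 << 3) | (c2 >> 2);   -- 5 -> 8 bits
 *
 * i.e. each field is shifted to the top of its byte and its own high bits
 * are re-inserted at the bottom, so that 0x1f expands to 0xff exactly.
 */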
67 #ifdef USE_GCC_INLINE_ASM
68 /* Some versions of gcc have problems with vshll_n_u8 intrinsic (Bug 23576) */
69 #define vshll_n_u8(a, n) ({ uint16x8_t r; \
70 asm ("vshll.u8 %q0, %P1, %2\n" : "=w" (r) : "w" (a), "i" (n)); r; })
73 static force_inline uint16x8_t
74 pack0565 (uint8x8x4_t s)
76 uint16x8_t rgb, val_g, val_r;
78 rgb = vshll_n_u8 (s.val[2], 8);
79 val_g = vshll_n_u8 (s.val[1], 8);
80 val_r = vshll_n_u8 (s.val[0], 8);
81 rgb = vsriq_n_u16 (rgb, val_g, 5);
82 rgb = vsriq_n_u16 (rgb, val_r, 5 + 6);
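/* The inverse packing above (vshll #8 followed by two vsri) amounts to the
 * scalar expression
 *
 *     p = ((s.val[2] & 0xf8) << 8) | ((s.val[1] & 0xfc) << 3) | (s.val[0] >> 3);
 *
 * i.e. val[2] lands in bits 15-11, val[1] in bits 10-5 and val[0] in bits 4-0
 * of each 16-bit pixel; the low bits of each channel are simply dropped.
 */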
87 static force_inline uint8x8_t
88 neon2mul (uint8x8_t x,
94 tmp = vmull_u8 (x, alpha);
95 tmp2 = vrshrq_n_u16 (tmp, 8);
96 res = vraddhn_u16 (tmp, tmp2);
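/* The vmull/vrshr/vraddhn sequence above is the usual NEON idiom for a
 * correctly rounded multiply by an 8-bit alpha; per byte it evaluates
 *
 *     t = x * alpha;
 *     result = (t + 128 + ((t + 128) >> 8)) >> 8;
 *
 * which equals t / 255 rounded to nearest, rather than the cheaper but
 * slightly dark approximation (t >> 8).
 */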
101 static force_inline uint8x8x4_t
102 neon8mul (uint8x8x4_t x,
107 uint16x8_t qtmp1, qtmp2;
109 tmp.val[0] = vmull_u8 (x.val[0], alpha);
110 tmp.val[1] = vmull_u8 (x.val[1], alpha);
111 tmp.val[2] = vmull_u8 (x.val[2], alpha);
112 tmp.val[3] = vmull_u8 (x.val[3], alpha);
114 qtmp1 = vrshrq_n_u16 (tmp.val[0], 8);
115 qtmp2 = vrshrq_n_u16 (tmp.val[1], 8);
116 res.val[0] = vraddhn_u16 (tmp.val[0], qtmp1);
117 qtmp1 = vrshrq_n_u16 (tmp.val[2], 8);
118 res.val[1] = vraddhn_u16 (tmp.val[1], qtmp2);
119 qtmp2 = vrshrq_n_u16 (tmp.val[3], 8);
120 res.val[2] = vraddhn_u16 (tmp.val[2], qtmp1);
121 res.val[3] = vraddhn_u16 (tmp.val[3], qtmp2);
126 static force_inline uint8x8x4_t
127 neon8qadd (uint8x8x4_t x,
132 res.val[0] = vqadd_u8 (x.val[0], y.val[0]);
133 res.val[1] = vqadd_u8 (x.val[1], y.val[1]);
134 res.val[2] = vqadd_u8 (x.val[2], y.val[2]);
135 res.val[3] = vqadd_u8 (x.val[3], y.val[3]);
141 neon_composite_add_8000_8000 (pixman_implementation_t * impl,
143 pixman_image_t * src_image,
144 pixman_image_t * mask_image,
145 pixman_image_t * dst_image,
155 uint8_t *dst_line, *dst;
156 uint8_t *src_line, *src;
157 int dst_stride, src_stride;
160 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
161 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
165 /* Use overlapping 8-pixel method */
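/* The "overlapping" part works like this: the first iteration composites a
 * full 8-pixel group at the left edge, but the pointers then advance by only
 * (w & 7) pixels, so the second group overlaps the first.  Each result is
 * stored one iteration late, through keep_dst, so the overlapping destination
 * load still sees the original (not yet composited) pixels and the shared
 * pixels are simply recomputed to the same value.  That way any width >= 8
 * is handled without a scalar edge loop.
 */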
168 uint8_t *keep_dst = 0;
169 uint8x8_t sval, dval, temp;
172 dst_line += dst_stride;
174 src_line += src_stride;
177 #ifndef USE_GCC_INLINE_ASM
178 sval = vld1_u8 ((void *)src);
179 dval = vld1_u8 ((void *)dst);
182 temp = vqadd_u8 (dval, sval);
190 sval = vld1_u8 ((void *)src);
191 dval = vld1_u8 ((void *)dst);
193 vst1_u8 ((void *)keep_dst, temp);
196 temp = vqadd_u8 (dval, sval);
203 vst1_u8 ((void *)keep_dst, temp);
206 /* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
207 "vld1.8 {d0}, [%[src]]\n\t"
208 "vld1.8 {d4}, [%[dst]]\n\t"
209 "mov %[keep_dst], %[dst]\n\t"
211 "and ip, %[w], #7\n\t"
212 "add %[src], %[src], ip\n\t"
213 "add %[dst], %[dst], ip\n\t"
214 "subs %[w], %[w], ip\n\t"
218 "vld1.8 {d0}, [%[src]]!\n\t"
219 "vld1.8 {d4}, [%[dst]]!\n\t"
220 "vst1.8 {d20}, [%[keep_dst]]\n\t"
221 "sub %[keep_dst], %[dst], #8\n\t"
222 "subs %[w], %[w], #8\n\t"
224 "vqadd.u8 d20, d0, d4\n\t"
229 "vst1.8 {d20}, [%[keep_dst]]\n\t"
231 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
233 : "ip", "cc", "memory", "d0", "d4",
241 const uint8_t nil = 0;
242 const uint8x8_t vnil = vld1_dup_u8 (&nil);
246 uint8x8_t sval = vnil, dval = vnil;
247 uint8_t *dst4 = 0, *dst2 = 0;
250 dst_line += dst_stride;
252 src_line += src_stride;
257 sval = vreinterpret_u8_u32 (
258 vld1_lane_u32 ((void *)src, vreinterpret_u32_u8 (sval), 1));
259 dval = vreinterpret_u8_u32 (
260 vld1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (dval), 1));
269 sval = vreinterpret_u8_u16 (
270 vld1_lane_u16 ((void *)src, vreinterpret_u16_u8 (sval), 1));
271 dval = vreinterpret_u8_u16 (
272 vld1_lane_u16 ((void *)dst, vreinterpret_u16_u8 (dval), 1));
281 sval = vld1_lane_u8 (src, sval, 1);
282 dval = vld1_lane_u8 (dst, dval, 1);
285 dval = vqadd_u8 (dval, sval);
288 vst1_lane_u8 (dst, dval, 1);
291 vst1_lane_u16 ((void *)dst2, vreinterpret_u16_u8 (dval), 1);
294 vst1_lane_u32 ((void *)dst4, vreinterpret_u32_u8 (dval), 1);
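/* The 8888 "over" compositors below all implement the usual premultiplied
 * OVER operator, per channel:
 *
 *     dst = src + mul_255 (dst, 255 - src_alpha)
 *
 * where mul_255() stands for the rounded multiply from neon2mul()/neon8mul()
 * above, vmvn_u8 provides the (255 - src_alpha) term and vqadd_u8 the final
 * saturating add.
 */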
300 neon_composite_over_8888_8888 (pixman_implementation_t * impl,
302 pixman_image_t * src_image,
303 pixman_image_t * mask_image,
304 pixman_image_t * dst_image,
314 uint32_t *dst_line, *dst;
315 uint32_t *src_line, *src;
316 int dst_stride, src_stride;
319 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
320 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
324 /* Use overlapping 8-pixel method */
327 uint32_t *keep_dst = 0;
328 uint8x8x4_t sval, dval, temp;
331 dst_line += dst_stride;
333 src_line += src_stride;
336 #ifndef USE_GCC_INLINE_ASM
337 sval = vld4_u8 ((void *)src);
338 dval = vld4_u8 ((void *)dst);
341 temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
342 temp = neon8qadd (sval, temp);
350 sval = vld4_u8 ((void *)src);
351 dval = vld4_u8 ((void *)dst);
353 vst4_u8 ((void *)keep_dst, temp);
356 temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
357 temp = neon8qadd (sval, temp);
364 vst4_u8 ((void *)keep_dst, temp);
367 /* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
368 "vld4.8 {d0-d3}, [%[src]]\n\t"
369 "vld4.8 {d4-d7}, [%[dst]]\n\t"
370 "mov %[keep_dst], %[dst]\n\t"
372 "and ip, %[w], #7\n\t"
373 "add %[src], %[src], ip, LSL#2\n\t"
374 "add %[dst], %[dst], ip, LSL#2\n\t"
375 "subs %[w], %[w], ip\n\t"
379 "vld4.8 {d0-d3}, [%[src]]!\n\t"
380 "vld4.8 {d4-d7}, [%[dst]]!\n\t"
381 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
382 "sub %[keep_dst], %[dst], #8*4\n\t"
383 "subs %[w], %[w], #8\n\t"
386 "vmull.u8 q10, d31, d4\n\t"
387 "vmull.u8 q11, d31, d5\n\t"
388 "vmull.u8 q12, d31, d6\n\t"
389 "vmull.u8 q13, d31, d7\n\t"
390 "vrshr.u16 q8, q10, #8\n\t"
391 "vrshr.u16 q9, q11, #8\n\t"
392 "vraddhn.u16 d20, q10, q8\n\t"
393 "vraddhn.u16 d21, q11, q9\n\t"
394 "vrshr.u16 q8, q12, #8\n\t"
395 "vrshr.u16 q9, q13, #8\n\t"
396 "vraddhn.u16 d22, q12, q8\n\t"
397 "vraddhn.u16 d23, q13, q9\n\t"
398 /* result in d20-d23 */
399 "vqadd.u8 d20, d0, d20\n\t"
400 "vqadd.u8 d21, d1, d21\n\t"
401 "vqadd.u8 d22, d2, d22\n\t"
402 "vqadd.u8 d23, d3, d23\n\t"
407 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
409 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
411 : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
412 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23"
419 uint8x8_t alpha_selector = vreinterpret_u8_u64 (
420 vcreate_u64 (0x0707070703030303ULL));
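/* alpha_selector is a vtbl1_u8 index vector: the D register holds two packed
 * 32-bit pixels (bytes 0-3 and 4-7; with the little-endian a8r8g8b8 layout
 * the alpha is byte 3 of each pixel), and the table lookup broadcasts byte 3
 * into lanes 0-3 and byte 7 into lanes 4-7, so every colour byte gets
 * multiplied by its own pixel's alpha.
 */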
422 /* Handle width < 8 */
426 dst_line += dst_stride;
428 src_line += src_stride;
433 uint8x8_t sval, dval;
435 /* two 32-bit pixels packed into D-reg; ad-hoc vectorization */
436 sval = vreinterpret_u8_u32 (vld1_u32 ((void *)src));
437 dval = vreinterpret_u8_u32 (vld1_u32 ((void *)dst));
438 dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
439 vst1_u8 ((void *)dst, vqadd_u8 (sval, dval));
448 uint8x8_t sval, dval;
450 /* single 32-bit pixel in lane 0 */
451 sval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)src)); /* only interested in lane 0 */
452 dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)dst)); /* only interested in lane 0 */
453 dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
454 vst1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (vqadd_u8 (sval, dval)), 0);
461 neon_composite_over_8888_n_8888 (pixman_implementation_t * impl,
463 pixman_image_t * src_image,
464 pixman_image_t * mask_image,
465 pixman_image_t * dst_image,
475 uint32_t *dst_line, *dst;
476 uint32_t *src_line, *src;
478 int dst_stride, src_stride;
480 uint8x8_t mask_alpha;
482 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
483 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
485 mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
486 mask_alpha = vdup_n_u8 ((mask) >> 24);
490 /* Use overlapping 8-pixel method */
494 dst_line += dst_stride;
496 src_line += src_stride;
499 uint32_t *keep_dst = 0;
501 #ifndef USE_GCC_INLINE_ASM
502 uint8x8x4_t sval, dval, temp;
504 sval = vld4_u8 ((void *)src);
505 dval = vld4_u8 ((void *)dst);
508 sval = neon8mul (sval, mask_alpha);
509 temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
510 temp = neon8qadd (sval, temp);
518 sval = vld4_u8 ((void *)src);
519 dval = vld4_u8 ((void *)dst);
521 vst4_u8 ((void *)keep_dst, temp);
524 sval = neon8mul (sval, mask_alpha);
525 temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
526 temp = neon8qadd (sval, temp);
532 vst4_u8 ((void *)keep_dst, temp);
535 /* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
536 "vdup.32 d30, %[mask]\n\t"
537 "vdup.8 d30, d30[3]\n\t"
539 "vld4.8 {d0-d3}, [%[src]]\n\t"
540 "vld4.8 {d4-d7}, [%[dst]]\n\t"
541 "mov %[keep_dst], %[dst]\n\t"
543 "and ip, %[w], #7\n\t"
544 "add %[src], %[src], ip, LSL#2\n\t"
545 "add %[dst], %[dst], ip, LSL#2\n\t"
546 "subs %[w], %[w], ip\n\t"
550 "vld4.8 {d0-d3}, [%[src]]!\n\t"
551 "vld4.8 {d4-d7}, [%[dst]]!\n\t"
552 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
553 "sub %[keep_dst], %[dst], #8*4\n\t"
554 "subs %[w], %[w], #8\n\t"
557 "vmull.u8 q10, d30, d0\n\t"
558 "vmull.u8 q11, d30, d1\n\t"
559 "vmull.u8 q12, d30, d2\n\t"
560 "vmull.u8 q13, d30, d3\n\t"
561 "vrshr.u16 q8, q10, #8\n\t"
562 "vrshr.u16 q9, q11, #8\n\t"
563 "vraddhn.u16 d0, q10, q8\n\t"
564 "vraddhn.u16 d1, q11, q9\n\t"
565 "vrshr.u16 q9, q13, #8\n\t"
566 "vrshr.u16 q8, q12, #8\n\t"
567 "vraddhn.u16 d3, q13, q9\n\t"
568 "vraddhn.u16 d2, q12, q8\n\t"
571 "vmull.u8 q10, d31, d4\n\t"
572 "vmull.u8 q11, d31, d5\n\t"
573 "vmull.u8 q12, d31, d6\n\t"
574 "vmull.u8 q13, d31, d7\n\t"
575 "vrshr.u16 q8, q10, #8\n\t"
576 "vrshr.u16 q9, q11, #8\n\t"
577 "vraddhn.u16 d20, q10, q8\n\t"
578 "vrshr.u16 q8, q12, #8\n\t"
579 "vraddhn.u16 d21, q11, q9\n\t"
580 "vrshr.u16 q9, q13, #8\n\t"
581 "vraddhn.u16 d22, q12, q8\n\t"
582 "vraddhn.u16 d23, q13, q9\n\t"
584 /* result in d20-d23 */
585 "vqadd.u8 d20, d0, d20\n\t"
586 "vqadd.u8 d21, d1, d21\n\t"
587 "vqadd.u8 d22, d2, d22\n\t"
588 "vqadd.u8 d23, d3, d23\n\t"
593 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
595 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
597 : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
598 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
606 uint8x8_t alpha_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0707070703030303ULL));
608 /* Handle width < 8 */
612 dst_line += dst_stride;
614 src_line += src_stride;
619 uint8x8_t sval, dval;
621 sval = vreinterpret_u8_u32 (vld1_u32 ((void *)src));
622 dval = vreinterpret_u8_u32 (vld1_u32 ((void *)dst));
624 /* sval * const alpha_mul */
625 sval = neon2mul (sval, mask_alpha);
627 /* dval * 255-(src alpha) */
628 dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
630 vst1_u8 ((void *)dst, vqadd_u8 (sval, dval));
639 uint8x8_t sval, dval;
641 sval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)src));
642 dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)dst));
644 /* sval * const alpha_mul */
645 sval = neon2mul (sval, mask_alpha);
647 /* dval * 255-(src alpha) */
648 dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
650 vst1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (vqadd_u8 (sval, dval)), 0);
657 neon_composite_over_n_8_0565 (pixman_implementation_t * impl,
659 pixman_image_t * src_image,
660 pixman_image_t * mask_image,
661 pixman_image_t * dst_image,
672 uint16_t *dst_line, *dst;
673 uint8_t *mask_line, *mask;
674 int dst_stride, mask_stride;
679 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
685 sval2 = vreinterpret_u8_u32 (vdup_n_u32 (src));
686 sval8.val[0] = vdup_lane_u8 (sval2, 0);
687 sval8.val[1] = vdup_lane_u8 (sval2, 1);
688 sval8.val[2] = vdup_lane_u8 (sval2, 2);
689 sval8.val[3] = vdup_lane_u8 (sval2, 3);
691 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
692 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
696 /* Use overlapping 8-pixel method, modified to avoid rewritten dest being reused */
699 uint16_t *keep_dst = 0;
702 dst_line += dst_stride;
704 mask_line += mask_stride;
707 #ifndef USE_GCC_INLINE_ASM
709 uint16x8_t dval, temp;
710 uint8x8x4_t sval8temp;
712 alpha = vld1_u8 ((void *)mask);
713 dval = vld1q_u16 ((void *)dst);
716 sval8temp = neon8mul (sval8, alpha);
717 temp = pack0565 (neon8qadd (sval8temp, neon8mul (unpack0565 (dval), vmvn_u8 (sval8temp.val[3]))));
725 dval = vld1q_u16 ((void *)dst);
726 alpha = vld1_u8 ((void *)mask);
728 vst1q_u16 ((void *)keep_dst, temp);
731 sval8temp = neon8mul (sval8, alpha);
732 temp = pack0565 (neon8qadd (sval8temp, neon8mul (unpack0565 (dval), vmvn_u8 (sval8temp.val[3]))));
738 vst1q_u16 ((void *)keep_dst, temp);
741 "vdup.32 d0, %[src]\n\t"
742 "vdup.8 d1, d0[1]\n\t"
743 "vdup.8 d2, d0[2]\n\t"
744 "vdup.8 d3, d0[3]\n\t"
745 "vdup.8 d0, d0[0]\n\t"
747 "vld1.8 {q12}, [%[dst]]\n\t"
748 "vld1.8 {d31}, [%[mask]]\n\t"
749 "mov %[keep_dst], %[dst]\n\t"
751 "and ip, %[w], #7\n\t"
752 "add %[mask], %[mask], ip\n\t"
753 "add %[dst], %[dst], ip, LSL#1\n\t"
754 "subs %[w], %[w], ip\n\t"
759 "vld1.16 {q12}, [%[dst]]!\n\t"
760 "vld1.8 {d31}, [%[mask]]!\n\t"
761 "vst1.16 {q10}, [%[keep_dst]]\n\t"
762 "sub %[keep_dst], %[dst], #8*2\n\t"
763 "subs %[w], %[w], #8\n\t"
765 /* expand 0565 q12 to 8888 {d4-d7} */
766 "vmovn.u16 d4, q12\t\n"
767 "vshr.u16 q11, q12, #5\t\n"
768 "vshr.u16 q10, q12, #6+5\t\n"
769 "vmovn.u16 d5, q11\t\n"
770 "vmovn.u16 d6, q10\t\n"
771 "vshl.u8 d4, d4, #3\t\n"
772 "vshl.u8 d5, d5, #2\t\n"
773 "vshl.u8 d6, d6, #3\t\n"
774 "vsri.u8 d4, d4, #5\t\n"
775 "vsri.u8 d5, d5, #6\t\n"
776 "vsri.u8 d6, d6, #5\t\n"
778 "vmull.u8 q10, d31, d0\n\t"
779 "vmull.u8 q11, d31, d1\n\t"
780 "vmull.u8 q12, d31, d2\n\t"
781 "vmull.u8 q13, d31, d3\n\t"
782 "vrshr.u16 q8, q10, #8\n\t"
783 "vrshr.u16 q9, q11, #8\n\t"
784 "vraddhn.u16 d20, q10, q8\n\t"
785 "vraddhn.u16 d21, q11, q9\n\t"
786 "vrshr.u16 q9, q13, #8\n\t"
787 "vrshr.u16 q8, q12, #8\n\t"
788 "vraddhn.u16 d23, q13, q9\n\t"
789 "vraddhn.u16 d22, q12, q8\n\t"
791 /* duplicate in 4/2/1 & 8pix vsns */
792 "vmvn.8 d30, d23\n\t"
793 "vmull.u8 q14, d30, d6\n\t"
794 "vmull.u8 q13, d30, d5\n\t"
795 "vmull.u8 q12, d30, d4\n\t"
796 "vrshr.u16 q8, q14, #8\n\t"
797 "vrshr.u16 q9, q13, #8\n\t"
798 "vraddhn.u16 d6, q14, q8\n\t"
799 "vrshr.u16 q8, q12, #8\n\t"
800 "vraddhn.u16 d5, q13, q9\n\t"
801 "vqadd.u8 d6, d6, d22\n\t" /* moved up */
802 "vraddhn.u16 d4, q12, q8\n\t"
803 /* intentionally don't calculate alpha */
804 /* result in d4-d6 */
806 /* "vqadd.u8 d6, d6, d22\n\t" ** moved up */
807 "vqadd.u8 d5, d5, d21\n\t"
808 "vqadd.u8 d4, d4, d20\n\t"
810 /* pack 8888 {d4-d6} to 0565 q10 */
811 "vshll.u8 q10, d6, #8\n\t"
812 "vshll.u8 q3, d5, #8\n\t"
813 "vshll.u8 q2, d4, #8\n\t"
814 "vsri.u16 q10, q3, #5\t\n"
815 "vsri.u16 q10, q2, #11\t\n"
820 "vst1.16 {q10}, [%[keep_dst]]\n\t"
822 : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst)
824 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
825 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
835 void *dst4 = 0, *dst2 = 0;
838 dst_line += dst_stride;
840 mask_line += mask_stride;
844 #if 1 /* #ifndef USE_GCC_INLINE_ASM */
846 uint16x8_t dval, temp;
847 uint8x8x4_t sval8temp;
851 alpha = vreinterpret_u8_u32 (vld1_lane_u32 ((void *)mask, vreinterpret_u32_u8 (alpha), 1));
852 dval = vreinterpretq_u16_u64 (vld1q_lane_u64 ((void *)dst, vreinterpretq_u64_u16 (dval), 1));
859 alpha = vreinterpret_u8_u16 (vld1_lane_u16 ((void *)mask, vreinterpret_u16_u8 (alpha), 1));
860 dval = vreinterpretq_u16_u32 (vld1q_lane_u32 ((void *)dst, vreinterpretq_u32_u16 (dval), 1));
867 alpha = vld1_lane_u8 ((void *)mask, alpha, 1);
868 dval = vld1q_lane_u16 ((void *)dst, dval, 1);
871 sval8temp = neon8mul (sval8, alpha);
872 temp = pack0565 (neon8qadd (sval8temp, neon8mul (unpack0565 (dval), vmvn_u8 (sval8temp.val[3]))));
875 vst1q_lane_u16 ((void *)dst, temp, 1);
877 vst1q_lane_u32 ((void *)dst2, vreinterpretq_u32_u16 (temp), 1);
879 vst1q_lane_u64 ((void *)dst4, vreinterpretq_u64_u16 (temp), 1);
881 /* this code has a bug (it does not pass the blitters-test) */
883 "vdup.32 d0, %[src]\n\t"
884 "vdup.8 d1, d0[1]\n\t"
885 "vdup.8 d2, d0[2]\n\t"
886 "vdup.8 d3, d0[3]\n\t"
887 "vdup.8 d0, d0[0]\n\t"
892 "vld1.64 {d25}, [%[dst]]\n\t"
893 "vld1.32 {d31[1]}, [%[mask]]\n\t"
894 "mov %[dst4], %[dst]\t\n"
895 "add %[mask], %[mask], #4\t\n"
896 "add %[dst], %[dst], #4*2\t\n"
901 "vld1.32 {d24[1]}, [%[dst]]\n\t"
902 "vld1.16 {d31[1]}, [%[mask]]\n\t"
903 "mov %[dst2], %[dst]\t\n"
904 "add %[mask], %[mask], #2\t\n"
905 "add %[dst], %[dst], #2*2\t\n"
910 "vld1.16 {d24[1]}, [%[dst]]\n\t"
911 "vld1.8 {d31[1]}, [%[mask]]\n\t"
914 /* expand 0565 q12 to 8888 {d4-d7} */
915 "vmovn.u16 d4, q12\t\n"
916 "vshr.u16 q11, q12, #5\t\n"
917 "vshr.u16 q10, q12, #6+5\t\n"
918 "vmovn.u16 d5, q11\t\n"
919 "vmovn.u16 d6, q10\t\n"
920 "vshl.u8 d4, d4, #3\t\n"
921 "vshl.u8 d5, d5, #2\t\n"
922 "vshl.u8 d6, d6, #3\t\n"
923 "vsri.u8 d4, d4, #5\t\n"
924 "vsri.u8 d5, d5, #6\t\n"
925 "vsri.u8 d6, d6, #5\t\n"
927 "vmull.u8 q10, d31, d0\n\t"
928 "vmull.u8 q11, d31, d1\n\t"
929 "vmull.u8 q12, d31, d2\n\t"
930 "vmull.u8 q13, d31, d3\n\t"
931 "vrshr.u16 q8, q10, #8\n\t"
932 "vrshr.u16 q9, q11, #8\n\t"
933 "vraddhn.u16 d20, q10, q8\n\t"
934 "vraddhn.u16 d21, q11, q9\n\t"
935 "vrshr.u16 q9, q13, #8\n\t"
936 "vrshr.u16 q8, q12, #8\n\t"
937 "vraddhn.u16 d23, q13, q9\n\t"
938 "vraddhn.u16 d22, q12, q8\n\t"
940 /* duplicate in 4/2/1 & 8pix vsns */
941 "vmvn.8 d30, d23\n\t"
942 "vmull.u8 q14, d30, d6\n\t"
943 "vmull.u8 q13, d30, d5\n\t"
944 "vmull.u8 q12, d30, d4\n\t"
945 "vrshr.u16 q8, q14, #8\n\t"
946 "vrshr.u16 q9, q13, #8\n\t"
947 "vraddhn.u16 d6, q14, q8\n\t"
948 "vrshr.u16 q8, q12, #8\n\t"
949 "vraddhn.u16 d5, q13, q9\n\t"
950 "vqadd.u8 d6, d6, d22\n\t" /* moved up */
951 "vraddhn.u16 d4, q12, q8\n\t"
952 /* intentionally don't calculate alpha */
953 /* result in d4-d6 */
955 /* "vqadd.u8 d6, d6, d22\n\t" ** moved up */
956 "vqadd.u8 d5, d5, d21\n\t"
957 "vqadd.u8 d4, d4, d20\n\t"
959 /* pack 8888 {d4-d6} to 0565 q10 */
960 "vshll.u8 q10, d6, #8\n\t"
961 "vshll.u8 q3, d5, #8\n\t"
962 "vshll.u8 q2, d4, #8\n\t"
963 "vsri.u16 q10, q3, #5\t\n"
964 "vsri.u16 q10, q2, #11\t\n"
967 "beq skip_store1\t\n"
968 "vst1.16 {d20[1]}, [%[dst]]\t\n"
971 "beq skip_store2\t\n"
972 "vst1.32 {d20[1]}, [%[dst2]]\t\n"
975 "beq skip_store4\t\n"
976 "vst1.16 {d21}, [%[dst4]]\t\n"
979 : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [dst4] "+r" (dst4), [dst2] "+r" (dst2)
981 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
982 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
991 neon_composite_over_n_8_8888 (pixman_implementation_t * impl,
993 pixman_image_t * src_image,
994 pixman_image_t * mask_image,
995 pixman_image_t * dst_image,
1006 uint32_t *dst_line, *dst;
1007 uint8_t *mask_line, *mask;
1008 int dst_stride, mask_stride;
1012 uint8x8_t mask_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0101010100000000ULL));
1013 uint8x8_t alpha_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0707070703030303ULL));
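/* As in neon_composite_over_8888_8888, alpha_selector broadcasts each packed
 * pixel's alpha byte over that pixel's four lanes; mask_selector does the
 * same for the two 8-bit mask values, which are loaded into bytes 0 and 1 of
 * a D register and then spread over lanes 0-3 and 4-7 respectively.
 */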
1015 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
1017 /* bail out if fully transparent */
1022 sval2 = vreinterpret_u8_u32 (vdup_n_u32 (src));
1023 sval8.val[0] = vdup_lane_u8 (sval2, 0);
1024 sval8.val[1] = vdup_lane_u8 (sval2, 1);
1025 sval8.val[2] = vdup_lane_u8 (sval2, 2);
1026 sval8.val[3] = vdup_lane_u8 (sval2, 3);
1028 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1029 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1033 /* Use overlapping 8-pixel method, modified to avoid
1034 * rewritten dest being reused
1038 uint32_t *keep_dst = 0;
1041 dst_line += dst_stride;
1043 mask_line += mask_stride;
1046 #ifndef USE_GCC_INLINE_ASM
1048 uint8x8x4_t dval, temp;
1050 alpha = vld1_u8 ((void *)mask);
1051 dval = vld4_u8 ((void *)dst);
1054 temp = neon8mul (sval8, alpha);
1055 dval = neon8mul (dval, vmvn_u8 (temp.val[3]));
1056 temp = neon8qadd (temp, dval);
1064 alpha = vld1_u8 ((void *)mask);
1065 dval = vld4_u8 ((void *)dst);
1067 vst4_u8 ((void *)keep_dst, temp);
1070 temp = neon8mul (sval8, alpha);
1071 dval = neon8mul (dval, vmvn_u8 (temp.val[3]));
1072 temp = neon8qadd (temp, dval);
1078 vst4_u8 ((void *)keep_dst, temp);
1081 "vdup.32 d0, %[src]\n\t"
1082 "vdup.8 d1, d0[1]\n\t"
1083 "vdup.8 d2, d0[2]\n\t"
1084 "vdup.8 d3, d0[3]\n\t"
1085 "vdup.8 d0, d0[0]\n\t"
1087 "vld4.8 {d4-d7}, [%[dst]]\n\t"
1088 "vld1.8 {d31}, [%[mask]]\n\t"
1089 "mov %[keep_dst], %[dst]\n\t"
1091 "and ip, %[w], #7\n\t"
1092 "add %[mask], %[mask], ip\n\t"
1093 "add %[dst], %[dst], ip, LSL#2\n\t"
1094 "subs %[w], %[w], ip\n\t"
1098 "vld4.8 {d4-d7}, [%[dst]]!\n\t"
1099 "vld1.8 {d31}, [%[mask]]!\n\t"
1100 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
1101 "sub %[keep_dst], %[dst], #8*4\n\t"
1102 "subs %[w], %[w], #8\n\t"
1105 "vmull.u8 q10, d31, d0\n\t"
1106 "vmull.u8 q11, d31, d1\n\t"
1107 "vmull.u8 q12, d31, d2\n\t"
1108 "vmull.u8 q13, d31, d3\n\t"
1109 "vrshr.u16 q8, q10, #8\n\t"
1110 "vrshr.u16 q9, q11, #8\n\t"
1111 "vraddhn.u16 d20, q10, q8\n\t"
1112 "vraddhn.u16 d21, q11, q9\n\t"
1113 "vrshr.u16 q9, q13, #8\n\t"
1114 "vrshr.u16 q8, q12, #8\n\t"
1115 "vraddhn.u16 d23, q13, q9\n\t"
1116 "vraddhn.u16 d22, q12, q8\n\t"
1118 "vmvn.8 d30, d23\n\t"
1119 "vmull.u8 q12, d30, d4\n\t"
1120 "vmull.u8 q13, d30, d5\n\t"
1121 "vmull.u8 q14, d30, d6\n\t"
1122 "vmull.u8 q15, d30, d7\n\t"
1124 "vrshr.u16 q8, q12, #8\n\t"
1125 "vrshr.u16 q9, q13, #8\n\t"
1126 "vraddhn.u16 d4, q12, q8\n\t"
1127 "vrshr.u16 q8, q14, #8\n\t"
1128 "vraddhn.u16 d5, q13, q9\n\t"
1129 "vrshr.u16 q9, q15, #8\n\t"
1130 "vraddhn.u16 d6, q14, q8\n\t"
1131 "vraddhn.u16 d7, q15, q9\n\t"
1132 /* result in d4-d7 */
1134 "vqadd.u8 d20, d4, d20\n\t"
1135 "vqadd.u8 d21, d5, d21\n\t"
1136 "vqadd.u8 d22, d6, d22\n\t"
1137 "vqadd.u8 d23, d7, d23\n\t"
1142 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
1144 : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst)
1146 : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
1147 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
1160 dst_line += dst_stride;
1162 mask_line += mask_stride;
1167 uint8x8_t dval, temp, res;
1170 vreinterpret_u8_u16 (vld1_dup_u16 ((void *)mask)), mask_selector);
1171 dval = vld1_u8 ((void *)dst);
1173 temp = neon2mul (sval2, alpha);
1175 temp, neon2mul (dval, vtbl1_u8 (vmvn_u8 (temp), alpha_selector)));
1177 vst1_u8 ((void *)dst, res);
1186 uint8x8_t dval, temp, res;
1188 alpha = vtbl1_u8 (vld1_dup_u8 ((void *)mask), mask_selector);
1189 dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)dst));
1191 temp = neon2mul (sval2, alpha);
1193 temp, neon2mul (dval, vtbl1_u8 (vmvn_u8 (temp), alpha_selector)));
1195 vst1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (res), 0);
1202 neon_composite_add_8888_8_8 (pixman_implementation_t * impl,
1204 pixman_image_t * src_image,
1205 pixman_image_t * mask_image,
1206 pixman_image_t * dst_image,
1216 uint8_t *dst_line, *dst;
1217 uint8_t *mask_line, *mask;
1218 int dst_stride, mask_stride;
1223 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
1224 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1225 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
1226 sa = vdup_n_u8 ((src) >> 24);
1230 /* Use overlapping 8-pixel method, modified to avoid rewritten dest being reused */
1234 dst_line += dst_stride;
1236 mask_line += mask_stride;
1239 uint8x8_t mval, dval, res;
1242 mval = vld1_u8 ((void *)mask);
1243 dval = vld1_u8 ((void *)dst);
1246 res = vqadd_u8 (neon2mul (mval, sa), dval);
1254 mval = vld1_u8 ((void *)mask);
1255 dval = vld1_u8 ((void *)dst);
1256 vst1_u8 ((void *)keep_dst, res);
1259 res = vqadd_u8 (neon2mul (mval, sa), dval);
1265 vst1_u8 ((void *)keep_dst, res);
1270 /* Use 4/2/1 load/store method to handle 1-7 pixels */
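/* The remaining 1-7 pixels are decomposed by the binary digits of w: up to 4
 * pixels go into a 32-bit lane, up to 2 into a 16-bit lane and the last one
 * into an 8-bit lane of the same D registers, the whole register is processed
 * at once, and the same fragments are written back with lane stores (dst4 and
 * dst2 remember where the 4- and 2-pixel pieces came from).
 */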
1274 dst_line += dst_stride;
1276 mask_line += mask_stride;
1279 uint8x8_t mval = sa, dval = sa, res;
1280 uint8_t *dst4 = 0, *dst2 = 0;
1284 mval = vreinterpret_u8_u32 (
1285 vld1_lane_u32 ((void *)mask, vreinterpret_u32_u8 (mval), 1));
1286 dval = vreinterpret_u8_u32 (
1287 vld1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (dval), 1));
1296 mval = vreinterpret_u8_u16 (
1297 vld1_lane_u16 ((void *)mask, vreinterpret_u16_u8 (mval), 1));
1298 dval = vreinterpret_u8_u16 (
1299 vld1_lane_u16 ((void *)dst, vreinterpret_u16_u8 (dval), 1));
1307 mval = vld1_lane_u8 (mask, mval, 1);
1308 dval = vld1_lane_u8 (dst, dval, 1);
1311 res = vqadd_u8 (neon2mul (mval, sa), dval);
1314 vst1_lane_u8 (dst, res, 1);
1316 vst1_lane_u16 ((void *)dst2, vreinterpret_u16_u8 (res), 1);
1318 vst1_lane_u32 ((void *)dst4, vreinterpret_u32_u8 (res), 1);
1323 #ifdef USE_GCC_INLINE_ASM
1326 neon_composite_src_16_16 (pixman_implementation_t * impl,
1328 pixman_image_t * src_image,
1329 pixman_image_t * mask_image,
1330 pixman_image_t * dst_image,
1340 uint16_t *dst_line, *src_line;
1341 uint32_t dst_stride, src_stride;
1343 if (!height || !width)
1346 /* We simply copy 16-bit-aligned pixels from one place to another. */
1347 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
1348 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1350 /* Preload the first input scanline */
1352 uint16_t *src_ptr = src_line;
1353 uint32_t count = width;
1357 " subs %[count], %[count], #32 \n"
1359 " add %[src], %[src], #64 \n"
1362 /* Clobbered input registers marked as input/outputs */
1363 : [src] "+r" (src_ptr), [count] "+r" (count)
1364 : /* no unclobbered inputs */
1371 uint16_t *dst_ptr = dst_line;
1372 uint16_t *src_ptr = src_line;
1373 uint32_t count = width;
1376 /* Uses multi-register access and preloading to maximise bandwidth.
1377 * Each pixel is one halfword, so a quadword contains 8px.
1378 * The preload frequency assumes a 64-byte cacheline.
1381 " cmp %[count], #64 \n"
1382 " blt 1f @ skip oversized fragments \n"
1383 "0: @ start with eight quadwords at a time \n"
1384 /* preload from next scanline */
1385 " pld [%[src], %[src_stride], LSL #1] \n"
1386 " sub %[count], %[count], #64 \n"
1387 " vld1.16 {d16, d17, d18, d19}, [%[src]]! \n"
1388 " vld1.16 {d20, d21, d22, d23}, [%[src]]! \n"
1389 /* preload from next scanline */
1390 " pld [%[src], %[src_stride], LSL #1] \n"
1391 " vld1.16 {d24, d25, d26, d27}, [%[src]]! \n"
1392 " vld1.16 {d28, d29, d30, d31}, [%[src]]! \n"
1393 " cmp %[count], #64 \n"
1394 " vst1.16 {d16, d17, d18, d19}, [%[dst]]! \n"
1395 " vst1.16 {d20, d21, d22, d23}, [%[dst]]! \n"
1396 " vst1.16 {d24, d25, d26, d27}, [%[dst]]! \n"
1397 " vst1.16 {d28, d29, d30, d31}, [%[dst]]! \n"
1399 " cmp %[count], #0 \n"
1400 " beq 7f @ aligned fastpath \n"
1401 "1: @ four quadwords \n"
1402 " tst %[count], #32 \n"
1403 " beq 2f @ skip oversized fragment \n"
1404 /* preload from next scanline */
1405 " pld [%[src], %[src_stride], LSL #1] \n"
1406 " vld1.16 {d16, d17, d18, d19}, [%[src]]! \n"
1407 " vld1.16 {d20, d21, d22, d23}, [%[src]]! \n"
1408 " vst1.16 {d16, d17, d18, d19}, [%[dst]]! \n"
1409 " vst1.16 {d20, d21, d22, d23}, [%[dst]]! \n"
1410 "2: @ two quadwords \n"
1411 " tst %[count], #16 \n"
1412 " beq 3f @ skip oversized fragment \n"
1413 /* preload from next scanline */
1414 " pld [%[src], %[src_stride], LSL #1] \n"
1415 " vld1.16 {d16, d17, d18, d19}, [%[src]]! \n"
1416 " vst1.16 {d16, d17, d18, d19}, [%[dst]]! \n"
1417 "3: @ one quadword \n"
1418 " tst %[count], #8 \n"
1419 " beq 4f @ skip oversized fragment \n"
1420 " vld1.16 {d16, d17}, [%[src]]! \n"
1421 " vst1.16 {d16, d17}, [%[dst]]! \n"
1422 "4: @ one doubleword \n"
1423 " tst %[count], #4 \n"
1424 " beq 5f @ skip oversized fragment \n"
1425 " vld1.16 {d16}, [%[src]]! \n"
1426 " vst1.16 {d16}, [%[dst]]! \n"
1428 " tst %[count], #2 \n"
1429 " beq 6f @ skip oversized fragment \n"
1430 " ldr %[tmp], [%[src]], #4 \n"
1431 " str %[tmp], [%[dst]], #4 \n"
1432 "6: @ one halfword \n"
1433 " tst %[count], #1 \n"
1434 " beq 7f @ skip oversized fragment \n"
1435 " ldrh %[tmp], [%[src]] \n"
1436 " strh %[tmp], [%[dst]] \n"
1439 /* Clobbered input registers marked as input/outputs */
1440 : [dst] "+r" (dst_ptr), [src] "+r" (src_ptr),
1441 [count] "+r" (count), [tmp] "+r" (tmp)
1443 /* Unclobbered input */
1444 : [src_stride] "r" (src_stride)
1446 /* Clobbered vector registers */
1447 : "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
1448 "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31", "cc", "memory"
1451 src_line += src_stride;
1452 dst_line += dst_stride;
1456 #endif /* USE_GCC_INLINE_ASM */
1459 neon_composite_src_24_16 (pixman_implementation_t * impl,
1461 pixman_image_t * src_image,
1462 pixman_image_t * mask_image,
1463 pixman_image_t * dst_image,
1475 uint32_t dst_stride, src_stride;
1477 if (!width || !height)
1480 /* We simply copy pixels from one place to another,
1481 * assuming that the source's alpha is opaque.
1483 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1484 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1486 /* Preload the first input scanline */
1488 uint8_t *src_ptr = (uint8_t*) src_line;
1489 uint32_t count = (width + 15) / 16;
1491 #ifdef USE_GCC_INLINE_ASM
1494 " subs %[count], %[count], #1 \n"
1496 " add %[src], %[src], #64 \n"
1499 /* Clobbered input registers marked as input/outputs */
1500 : [src] "+r" (src_ptr), [count] "+r" (count)
1501 : /* no unclobbered inputs */
1516 uint16_t *dst_ptr = dst_line;
1517 uint32_t *src_ptr = src_line;
1518 uint32_t count = width;
1519 const uint32_t rb_mask = 0x1F;
1520 const uint32_t g_mask = 0x3F;
1522 /* If you're going to complain about a goto, take a long hard look
1523 * at the massive blocks of assembler this skips over. ;-)
1528 #ifdef USE_GCC_INLINE_ASM
1530 /* This is not as aggressive as the RGB565-source case.
1531 * Generally the source is in cached RAM when the formats are
1532 * different, so we use preload.
1534 * We don't need to blend, so we are not reading from the
1535 * uncached framebuffer.
1538 " cmp %[count], #16 \n"
1539 " blt 1f @ skip oversized fragments \n"
1540 "0: @ start with sixteen pixels at a time \n"
1541 " sub %[count], %[count], #16 \n"
1542 " pld [%[src], %[src_stride], lsl #2] @ preload from next scanline \n"
1543 " vld4.8 {d0, d1, d2, d3}, [%[src]]! @ d3 is alpha and ignored, d2-0 are rgb. \n"
1544 " vld4.8 {d4, d5, d6, d7}, [%[src]]! @ d7 is alpha and ignored, d6-4 are rgb. \n"
1545 " vshll.u8 q8, d2, #8 @ expand first red for repacking \n"
1546 " vshll.u8 q10, d1, #8 @ expand first green for repacking \n"
1547 " vshll.u8 q11, d0, #8 @ expand first blue for repacking \n"
1548 " vshll.u8 q9, d6, #8 @ expand second red for repacking \n"
1549 " vsri.u16 q8, q10, #5 @ insert first green after red \n"
1550 " vshll.u8 q10, d5, #8 @ expand second green for repacking \n"
1551 " vsri.u16 q8, q11, #11 @ insert first blue after green \n"
1552 " vshll.u8 q11, d4, #8 @ expand second blue for repacking \n"
1553 " vsri.u16 q9, q10, #5 @ insert second green after red \n"
1554 " vsri.u16 q9, q11, #11 @ insert second blue after green \n"
1555 " cmp %[count], #16 \n"
1556 " vst1.16 {d16, d17, d18, d19}, [%[dst]]! @ store 16 pixels \n"
1558 "1: @ end of main loop \n"
1559 " cmp %[count], #8 @ can we still do an 8-pixel block? \n"
1561 " sub %[count], %[count], #8 \n"
1562 " pld [%[src], %[src_stride], lsl #2] @ preload from next scanline \n"
1563 " vld4.8 {d0, d1, d2, d3}, [%[src]]! @ d3 is alpha and ignored, d2-0 are rgb. \n"
1564 " vshll.u8 q8, d2, #8 @ expand first red for repacking \n"
1565 " vshll.u8 q10, d1, #8 @ expand first green for repacking \n"
1566 " vshll.u8 q11, d0, #8 @ expand first blue for repacking \n"
1567 " vsri.u16 q8, q10, #5 @ insert first green after red \n"
1568 " vsri.u16 q8, q11, #11 @ insert first blue after green \n"
1569 " vst1.16 {d16, d17}, [%[dst]]! @ store 8 pixels \n"
1572 /* Clobbered input and working registers marked as input/outputs */
1573 : [dst] "+r" (dst_ptr), [src] "+r" (src_ptr), [count] "+r" (count)
1575 /* Unclobbered input */
1576 : [src_stride] "r" (src_stride)
1578 /* Clobbered vector registers */
1580 /* NB: these are the double-register halves of the
1581 * quad registers used in the asm
1583 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17",
1584 "d18", "d19", "d20", "d21", "d22", "d23", "cc", "memory"
1587 /* A copy of the above code, in intrinsics-form. */
1590 uint8x8x4_t pixel_set_a, pixel_set_b;
1591 uint16x8_t red_a, green_a, blue_a;
1592 uint16x8_t red_b, green_b, blue_b;
1593 uint16x8_t dest_pixels_a, dest_pixels_b;
1596 __pld (src_ptr + src_stride);
1597 pixel_set_a = vld4_u8 ((uint8_t*)(src_ptr));
1598 pixel_set_b = vld4_u8 ((uint8_t*)(src_ptr + 8));
1601 red_a = vshll_n_u8 (pixel_set_a.val[2], 8);
1602 green_a = vshll_n_u8 (pixel_set_a.val[1], 8);
1603 blue_a = vshll_n_u8 (pixel_set_a.val[0], 8);
1605 red_b = vshll_n_u8 (pixel_set_b.val[2], 8);
1606 green_b = vshll_n_u8 (pixel_set_b.val[1], 8);
1607 blue_b = vshll_n_u8 (pixel_set_b.val[0], 8);
1609 dest_pixels_a = vsriq_n_u16 (red_a, green_a, 5);
1610 dest_pixels_b = vsriq_n_u16 (red_b, green_b, 5);
1612 dest_pixels_a = vsriq_n_u16 (dest_pixels_a, blue_a, 11);
1613 dest_pixels_b = vsriq_n_u16 (dest_pixels_b, blue_b, 11);
1615 /* There doesn't seem to be an intrinsic for the
1616 * double-quadword variant
1618 vst1q_u16 (dst_ptr, dest_pixels_a);
1619 vst1q_u16 (dst_ptr + 8, dest_pixels_b);
1626 uint8x8x4_t pixel_set_a;
1627 uint16x8_t red_a, green_a, blue_a;
1628 uint16x8_t dest_pixels_a;
1630 __pld (src_ptr + src_stride);
1632 pixel_set_a = vld4_u8 ((uint8_t*)(src_ptr));
1635 red_a = vshll_n_u8 (pixel_set_a.val[2], 8);
1636 green_a = vshll_n_u8 (pixel_set_a.val[1], 8);
1637 blue_a = vshll_n_u8 (pixel_set_a.val[0], 8);
1639 dest_pixels_a = vsriq_n_u16 (red_a, green_a, 5);
1640 dest_pixels_a = vsriq_n_u16 (dest_pixels_a, blue_a, 11);
1642 vst1q_u16 (dst_ptr, dest_pixels_a);
1646 #endif /* USE_GCC_INLINE_ASM */
1650 __pld (src_ptr + src_stride);
1654 uint32_t src_pixel_a = *src_ptr++;
1655 uint32_t src_pixel_b = *src_ptr++;
1657 /* ARM is really good at shift-then-ALU ops. */
1658 /* This should be a total of six shift-ANDs and five shift-ORs. */
1659 uint32_t dst_pixels_a;
1660 uint32_t dst_pixels_b;
1662 dst_pixels_a = ((src_pixel_a >> 3) & rb_mask);
1663 dst_pixels_a |= ((src_pixel_a >> 10) & g_mask) << 5;
1664 dst_pixels_a |= ((src_pixel_a >> 19) & rb_mask) << 11;
1666 dst_pixels_b = ((src_pixel_b >> 3) & rb_mask);
1667 dst_pixels_b |= ((src_pixel_b >> 10) & g_mask) << 5;
1668 dst_pixels_b |= ((src_pixel_b >> 19) & rb_mask) << 11;
1670 /* little-endian mode only */
1671 *((uint32_t*) dst_ptr) = dst_pixels_a | (dst_pixels_b << 16);
1678 uint32_t src_pixel = *src_ptr++;
1680 /* ARM is really good at shift-then-ALU ops.
1681 * This block should end up as three shift-ANDs
1682 * and two shift-ORs.
1684 uint32_t tmp_blue = (src_pixel >> 3) & rb_mask;
1685 uint32_t tmp_green = (src_pixel >> 10) & g_mask;
1686 uint32_t tmp_red = (src_pixel >> 19) & rb_mask;
1687 uint16_t dst_pixel = (tmp_red << 11) | (tmp_green << 5) | tmp_blue;
1689 *dst_ptr++ = dst_pixel;
1693 src_line += src_stride;
1694 dst_line += dst_stride;
1698 static pixman_bool_t
1699 pixman_fill_neon (uint32_t *bits,
1708 uint32_t byte_stride, color;
1711 /* stride is always a multiple of 32-bit units in pixman */
1712 byte_stride = stride * sizeof(uint32_t);
1717 dst = ((char *) bits) + y * byte_stride + x;
1719 color = _xor << 24 | _xor << 16 | _xor << 8 | _xor;
1723 dst = ((char *) bits) + y * byte_stride + x * 2;
1725 color = _xor << 16 | _xor;
1726 width *= 2; /* width to bytes */
1730 dst = ((char *) bits) + y * byte_stride + x * 4;
1732 width *= 4; /* width to bytes */
1739 #ifdef USE_GCC_INLINE_ASM
1742 /* We have a special case for widths so small that they don't allow
1743 * us to use wide 128-bit stores anyway. We don't waste time
1744 * trying to align writes, since there are only very few of them.
1747 "cmp %[height], #0\n"/* Check if empty fill */
1749 "vdup.32 d0, %[color]\n"/* Fill the color to neon req */
1751 /* Check if we have a such width that can easily be handled by single
1752 * operation for each scanline. This significantly reduces the number
1753 * of test/branch instructions for each scanline
1755 "cmp %[width], #8\n"
1757 "cmp %[width], #4\n"
1759 "cmp %[width], #2\n"
1762 /* Loop starts here for each scanline */
1764 "mov r4, %[dst]\n" /* Starting address of the current line */
1765 "tst %[width], #8\n"
1767 "vst1.8 {d0}, [r4]!\n"
1769 "tst %[width], #4\n"
1771 "str %[color], [r4], #4\n"
1773 "tst %[width], #2\n"
1775 "strh %[color], [r4], #2\n"
1777 "tst %[width], #1\n"
1779 "strb %[color], [r4], #1\n"
1782 "subs %[height], %[height], #1\n"
1783 "add %[dst], %[dst], %[byte_stride]\n"
1787 /* Special fillers for those widths that we can do with a single operation */
1789 "subs %[height], %[height], #1\n"
1790 "vst1.8 {d0}, [%[dst]]\n"
1791 "add %[dst], %[dst], %[byte_stride]\n"
1796 "subs %[height], %[height], #1\n"
1797 "str %[color], [%[dst]]\n"
1798 "add %[dst], %[dst], %[byte_stride]\n"
1803 "subs %[height], %[height], #1\n"
1804 "strh %[color], [%[dst]]\n"
1805 "add %[dst], %[dst], %[byte_stride]\n"
1809 : [height] "+r" (height), [dst] "+r" (dst)
1810 : [color] "r" (color), [width] "r" (width),
1811 [byte_stride] "r" (byte_stride)
1812 : "memory", "cc", "d0", "r4");
1817 "cmp %[height], #0\n"/* Check if empty fill */
1819 "vdup.32 q0, %[color]\n"/* Fill the color to neon req */
1821 /* Loop starts here for each scanline */
1823 "mov r4, %[dst]\n"/* Starting address of the current line */
1824 "mov r5, %[width]\n"/* We're going to write this many bytes */
1825 "ands r6, r4, #15\n"/* Are we at the 128-bit aligned address? */
1826 "beq 2f\n"/* Jump to the best case */
1828 /* We're not 128-bit aligned; however, we know that we can get to the
1829 next aligned location, since the fill is at least 16 bytes wide */
1830 "rsb r6, r6, #16\n" /* We would need to go forward this much */
1831 "sub r5, r5, r6\n"/* Update bytes left */
1834 "vst1.8 {d0[0]}, [r4]!\n"/* Store byte, now we are word aligned */
1838 "vst1.16 {d0[0]}, [r4, :16]!\n"/* Store half word, now we are 16-bit aligned */
1842 "vst1.32 {d0[0]}, [r4, :32]!\n"/* Store word, now we're 32-bit aligned */
1846 "vst1.64 {d0}, [r4, :64]!\n"/* Store qword now we're 64-bit aligned */
1848 /* The good case: We're 128-bit aligned for this scanline */
1850 "and r6, r5, #15\n"/* Number of tailing bytes */
1851 "cmp r5, r6\n"/* Do we have at least one qword to write? */
1852 "beq 6f\n"/* No, we just write the tail */
1853 "lsr r5, r5, #4\n"/* This many full qwords to write */
1855 /* The main block: Do 128-bit aligned writes */
1858 "vst1.64 {d0, d1}, [r4, :128]!\n"
1861 /* Handle the trailing bytes: Do 64, 32, 16 and 8-bit aligned writes as needed.
1862 We know that we're currently at a 128-bit aligned address, so we can just
1863 pick the biggest operations that the remaining write width allows */
1869 "vst1.64 {d0}, [r4, :64]!\n"
1873 "vst1.32 {d0[0]}, [r4, :32]!\n"
1877 "vst1.16 {d0[0]}, [r4, :16]!\n"
1881 "vst1.8 {d0[0]}, [r4]!\n"
1884 /* Handle the next scanline */
1885 "subs %[height], %[height], #1\n"
1886 "add %[dst], %[dst], %[byte_stride]\n"
1889 : [height] "+r" (height), [dst] "+r" (dst)
1890 : [color] "r" (color), [width] "r" (width),
1891 [byte_stride] "r" (byte_stride)
1892 : "memory", "cc", "d0", "d1", "r4", "r5", "r6");
1898 /* TODO: intrinsic version for armcc */
1904 /* TODO: is there a more generic way of doing this being introduced? */
1905 #define NEON_SCANLINE_BUFFER_PIXELS (1024)
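/* The RGB565 compositors further down do not blend into the framebuffer in
 * place: each destination scanline is first pulled into a cached stack buffer
 * with neon_quadword_copy, composited there, and written back in one pass,
 * since piecemeal read-modify-write of uncached framebuffer memory is very
 * slow.  NEON_SCANLINE_BUFFER_PIXELS is the widest scanline such a buffer has
 * to hold; wider blits are split into chunks of that size.
 */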
1908 neon_quadword_copy (void * dst,
1910 uint32_t count, /* of quadwords */
1911 uint32_t trailer_count /* of bytes */)
1913 uint8_t *t_dst = dst, *t_src = src;
1915 /* Uses aligned multi-register loads to maximise read bandwidth
1916 * on uncached memory such as framebuffers
1917 * The accesses do not have the aligned qualifiers, so that the copy
1918 * may convert between aligned-uncached and unaligned-cached memory.
1919 * It is assumed that the CPU can infer alignedness from the address.
1922 #ifdef USE_GCC_INLINE_ASM
1925 " cmp %[count], #8 \n"
1926 " blt 1f @ skip oversized fragments \n"
1927 "0: @ start with eight quadwords at a time \n"
1928 " sub %[count], %[count], #8 \n"
1929 " vld1.8 {d16, d17, d18, d19}, [%[src]]! \n"
1930 " vld1.8 {d20, d21, d22, d23}, [%[src]]! \n"
1931 " vld1.8 {d24, d25, d26, d27}, [%[src]]! \n"
1932 " vld1.8 {d28, d29, d30, d31}, [%[src]]! \n"
1933 " cmp %[count], #8 \n"
1934 " vst1.8 {d16, d17, d18, d19}, [%[dst]]! \n"
1935 " vst1.8 {d20, d21, d22, d23}, [%[dst]]! \n"
1936 " vst1.8 {d24, d25, d26, d27}, [%[dst]]! \n"
1937 " vst1.8 {d28, d29, d30, d31}, [%[dst]]! \n"
1939 "1: @ four quadwords \n"
1940 " tst %[count], #4 \n"
1941 " beq 2f @ skip oversized fragment \n"
1942 " vld1.8 {d16, d17, d18, d19}, [%[src]]! \n"
1943 " vld1.8 {d20, d21, d22, d23}, [%[src]]! \n"
1944 " vst1.8 {d16, d17, d18, d19}, [%[dst]]! \n"
1945 " vst1.8 {d20, d21, d22, d23}, [%[dst]]! \n"
1946 "2: @ two quadwords \n"
1947 " tst %[count], #2 \n"
1948 " beq 3f @ skip oversized fragment \n"
1949 " vld1.8 {d16, d17, d18, d19}, [%[src]]! \n"
1950 " vst1.8 {d16, d17, d18, d19}, [%[dst]]! \n"
1951 "3: @ one quadword \n"
1952 " tst %[count], #1 \n"
1953 " beq 4f @ skip oversized fragment \n"
1954 " vld1.8 {d16, d17}, [%[src]]! \n"
1955 " vst1.8 {d16, d17}, [%[dst]]! \n"
1958 /* Clobbered input registers marked as input/outputs */
1959 : [dst] "+r" (t_dst), [src] "+r" (t_src), [count] "+r" (count)
1961 /* No unclobbered inputs */
1964 /* Clobbered vector registers */
1965 : "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25",
1966 "d26", "d27", "d28", "d29", "d30", "d31", "cc", "memory");
1972 uint8x16x4_t t1 = vld4q_u8 (t_src);
1973 uint8x16x4_t t2 = vld4q_u8 (t_src + sizeof(uint8x16x4_t));
1975 t_src += sizeof(uint8x16x4_t) * 2;
1976 vst4q_u8 (t_dst, t1);
1977 vst4q_u8 (t_dst + sizeof(uint8x16x4_t), t2);
1978 t_dst += sizeof(uint8x16x4_t) * 2;
1984 uint8x16x4_t t1 = vld4q_u8 (t_src);
1986 t_src += sizeof(uint8x16x4_t);
1987 vst4q_u8 (t_dst, t1);
1988 t_dst += sizeof(uint8x16x4_t);
1993 uint8x8x4_t t1 = vld4_u8 (t_src);
1995 t_src += sizeof(uint8x8x4_t);
1996 vst4_u8 (t_dst, t1);
1997 t_dst += sizeof(uint8x8x4_t);
2002 uint8x16_t t1 = vld1q_u8 (t_src);
2004 t_src += sizeof(uint8x16_t);
2005 vst1q_u8 (t_dst, t1);
2006 t_dst += sizeof(uint8x16_t);
2009 #endif /* !USE_GCC_INLINE_ASM */
2013 if (trailer_count & 8)
2015 uint8x8_t t1 = vld1_u8 (t_src);
2017 t_src += sizeof(uint8x8_t);
2018 vst1_u8 (t_dst, t1);
2019 t_dst += sizeof(uint8x8_t);
2022 if (trailer_count & 4)
2024 *((uint32_t*) t_dst) = *((uint32_t*) t_src);
2030 if (trailer_count & 2)
2032 *((uint16_t*) t_dst) = *((uint16_t*) t_src);
2038 if (trailer_count & 1)
2040 *t_dst++ = *t_src++;
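/* For a scanline of 16-bit pixels the arguments work out as whole quadwords
 * plus a byte trailer, e.g.
 *
 *     neon_quadword_copy (dst, src, width >> 3, (width & 7) * 2);
 *
 * which is how the 0565 routines below stage scanlines in and out of their
 * scratch buffer.
 */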
2046 solid_over_565_8_pix_neon (uint32_t glyph_colour,
2049 uint32_t dest_stride, /* bytes, not elements */
2050 uint32_t mask_stride,
2051 uint32_t count /* 8-pixel groups */)
2053 /* Inner loop of glyph blitter (solid colour, alpha mask) */
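/* In effect, each destination channel is expanded from 5/6 bits to 8, then
 * blended as a plain lerp against the solid colour:
 *
 *     m   = (mask_byte * glyph_alpha) >> 8
 *     out = (fg * m + bg * (255 - m)) >> 8    (per channel)
 *
 * the final ">> 8" (an approximation of the exact /255) happens implicitly
 * when vsri packs the top bits of the 16-bit accumulators back into 565.
 */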
2055 #ifdef USE_GCC_INLINE_ASM
2058 " vld4.8 {d20[], d21[], d22[], d23[]}, [%[glyph_colour]] @ splat solid colour components \n"
2060 " vld1.16 {d0, d1}, [%[dest]] @ load first pixels from framebuffer \n"
2061 " vld1.8 {d17}, [%[in_mask]] @ load alpha mask of glyph \n"
2062 " vmull.u8 q9, d17, d23 @ apply glyph colour alpha to mask \n"
2063 " vshrn.u16 d17, q9, #8 @ reformat it to match original mask \n"
2064 " vmvn d18, d17 @ we need the inverse mask for the background \n"
2065 " vsli.u16 q3, q0, #5 @ duplicate framebuffer blue bits \n"
2066 " vshrn.u16 d2, q0, #8 @ unpack red from framebuffer pixels \n"
2067 " vshrn.u16 d4, q0, #3 @ unpack green \n"
2068 " vsri.u8 d2, d2, #5 @ duplicate red bits (extend 5 to 8) \n"
2069 " vshrn.u16 d6, q3, #2 @ unpack extended blue (truncate 10 to 8) \n"
2070 " vsri.u8 d4, d4, #6 @ duplicate green bits (extend 6 to 8) \n"
2071 " vmull.u8 q1, d2, d18 @ apply inverse mask to background red... \n"
2072 " vmull.u8 q2, d4, d18 @ ...green... \n"
2073 " vmull.u8 q3, d6, d18 @ ...blue \n"
2074 " subs %[count], %[count], #1 @ decrement/test loop counter \n"
2075 " vmlal.u8 q1, d17, d22 @ add masked foreground red... \n"
2076 " vmlal.u8 q2, d17, d21 @ ...green... \n"
2077 " vmlal.u8 q3, d17, d20 @ ...blue \n"
2078 " add %[in_mask], %[in_mask], %[mask_stride] @ advance mask pointer, while we wait \n"
2079 " vsri.16 q1, q2, #5 @ pack green behind red \n"
2080 " vsri.16 q1, q3, #11 @ pack blue into pixels \n"
2081 " vst1.16 {d2, d3}, [%[dest]] @ store composited pixels \n"
2082 " add %[dest], %[dest], %[dest_stride] @ advance framebuffer pointer \n"
2083 " bne 0b @ next please \n"
2085 /* Clobbered registers marked as input/outputs */
2086 : [dest] "+r" (dest), [in_mask] "+r" (in_mask), [count] "+r" (count)
2089 : [dest_stride] "r" (dest_stride), [mask_stride] "r" (mask_stride), [glyph_colour] "r" (&glyph_colour)
2091 /* Clobbers, including the inputs we modify, and potentially lots of memory */
2092 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d17", "d18", "d19",
2093 "d20", "d21", "d22", "d23", "d24", "d25", "cc", "memory"
2098 uint8x8x4_t solid_colour = vld4_dup_u8 ((uint8_t*) &glyph_colour);
2102 uint16x8_t pixels = vld1q_u16 (dest);
2103 uint8x8_t mask = vshrn_n_u16 (vmull_u8 (solid_colour.val[3], vld1_u8 (in_mask)), 8);
2104 uint8x8_t mask_image = vmvn_u8 (mask);
2106 uint8x8_t t_red = vshrn_n_u16 (pixels, 8);
2107 uint8x8_t t_green = vshrn_n_u16 (pixels, 3);
2108 uint8x8_t t_blue = vshrn_n_u16 (vsliq_n_u16 (pixels, pixels, 5), 2);
2110 uint16x8_t s_red = vmull_u8 (vsri_n_u8 (t_red, t_red, 5), mask_image);
2111 uint16x8_t s_green = vmull_u8 (vsri_n_u8 (t_green, t_green, 6), mask_image);
2112 uint16x8_t s_blue = vmull_u8 (t_blue, mask_image);
2114 s_red = vmlal_u8 (s_red, mask, solid_colour.val[2]);
2115 s_green = vmlal_u8 (s_green, mask, solid_colour.val[1]);
2116 s_blue = vmlal_u8 (s_blue, mask, solid_colour.val[0]);
2118 pixels = vsriq_n_u16 (s_red, s_green, 5);
2119 pixels = vsriq_n_u16 (pixels, s_blue, 11);
2120 vst1q_u16 (dest, pixels);
2122 dest += dest_stride / sizeof(*dest); /* dest_stride is in bytes */
2123 in_mask += mask_stride;
2129 #if 0 /* this is broken currently */
2131 neon_composite_over_n_8_0565 (pixman_implementation_t * impl,
2133 pixman_image_t * src_image,
2134 pixman_image_t * mask_image,
2135 pixman_image_t * dst_image,
2146 uint16_t *dst_line, *aligned_line;
2148 uint32_t dst_stride, mask_stride;
2149 uint32_t kernel_count, copy_count, copy_tail;
2150 uint8_t kernel_offset, copy_offset;
2152 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2154 /* bail out if fully transparent or degenerate */
2159 if (width == 0 || height == 0)
2162 if (width > NEON_SCANLINE_BUFFER_PIXELS)
2164 /* split the blit, so we can use a fixed-size scanline buffer
2165 * TODO: there must be a more elegant way of doing this.
2168 for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS)
2170 neon_composite_over_n_8_0565 (
2172 src_image, mask_image, dst_image,
2173 src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y,
2174 (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height);
2180 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2181 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2183 /* keep within minimum number of aligned quadwords on width
2184 * while also keeping the minimum number of columns to process
2187 unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
2188 unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
2189 unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
2191 /* the fast copy should be quadword aligned */
2192 copy_offset = dst_line - ((uint16_t*) aligned_left);
2193 aligned_line = dst_line - copy_offset;
2194 copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
2197 if (aligned_right - aligned_left > ceiling_length)
2199 /* unaligned routine is tightest */
2200 kernel_count = (uint32_t) (ceiling_length >> 4);
2201 kernel_offset = copy_offset;
2205 /* aligned routine is equally tight, so it is safer to align */
2206 kernel_count = copy_count;
2210 /* We should avoid reading beyond scanline ends for safety */
2211 if (aligned_line < (dst_line - dest_x) ||
2212 (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
2214 /* switch to precise read */
2215 copy_offset = kernel_offset = 0;
2216 aligned_line = dst_line;
2217 kernel_count = (uint32_t) (ceiling_length >> 4);
2218 copy_count = (width * sizeof(*dst_line)) >> 4;
2219 copy_tail = (width * sizeof(*dst_line)) & 0xF;
2224 uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8]; /* deliberately not initialised */
2225 uint8_t glyph_line[NEON_SCANLINE_BUFFER_PIXELS + 8];
2228 /* row-major order */
2229 /* left edge, middle block, right edge */
2230 for ( ; y--; mask_line += mask_stride, aligned_line += dst_stride, dst_line += dst_stride)
2232 /* We don't want to overrun the edges of the glyph,
2233 * so realign the edge data into known buffers
2235 neon_quadword_copy (glyph_line + copy_offset, mask_line, width >> 4, width & 0xF);
2237 /* Uncached framebuffer access is really, really slow
2238 * if we do it piecemeal. It should be much faster if we
2239 * grab it all at once. One scanline should easily fit in
2240 * L1 cache, so this should not waste RAM bandwidth.
2242 neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail);
2244 /* Apply the actual filter */
2245 solid_over_565_8_pix_neon (
2246 src, scan_line + kernel_offset,
2247 glyph_line + kernel_offset, 8 * sizeof(*dst_line),
2250 /* Copy the modified scanline back */
2251 neon_quadword_copy (dst_line, scan_line + copy_offset,
2252 width >> 3, (width & 7) * 2);
2258 #ifdef USE_GCC_INLINE_ASM
2261 plain_over_565_8_pix_neon (uint32_t colour,
2263 uint32_t dest_stride, /* bytes, not elements */
2264 uint32_t count /* 8-pixel groups */)
2266 /* Inner loop for plain translucent rects
2267 * (solid colour without alpha mask)
2270 " vld4.8 {d20[], d21[], d22[], d23[]}, [%[colour]] @ solid colour load/splat \n"
2271 " vmull.u8 q12, d23, d22 @ premultiply alpha red \n"
2272 " vmull.u8 q13, d23, d21 @ premultiply alpha green \n"
2273 " vmull.u8 q14, d23, d20 @ premultiply alpha blue \n"
2274 " vmvn d18, d23 @ inverse alpha for background \n"
2276 " vld1.16 {d0, d1}, [%[dest]] @ load first pixels from framebuffer \n"
2277 " vshrn.u16 d2, q0, #8 @ unpack red from framebuffer pixels \n"
2278 " vshrn.u16 d4, q0, #3 @ unpack green \n"
2279 " vsli.u16 q3, q0, #5 @ duplicate framebuffer blue bits \n"
2280 " vsri.u8 d2, d2, #5 @ duplicate red bits (extend 5 to 8) \n"
2281 " vsri.u8 d4, d4, #6 @ duplicate green bits (extend 6 to 8) \n"
2282 " vshrn.u16 d6, q3, #2 @ unpack extended blue (truncate 10 to 8) \n"
2283 " vmov q0, q12 @ retrieve foreground red \n"
2284 " vmlal.u8 q0, d2, d18 @ blend red - my kingdom for a four-operand MLA \n"
2285 " vmov q1, q13 @ retrieve foreground green \n"
2286 " vmlal.u8 q1, d4, d18 @ blend green \n"
2287 " vmov q2, q14 @ retrieve foreground blue \n"
2288 " vmlal.u8 q2, d6, d18 @ blend blue \n"
2289 " subs %[count], %[count], #1 @ decrement/test loop counter \n"
2290 " vsri.16 q0, q1, #5 @ pack green behind red \n"
2291 " vsri.16 q0, q2, #11 @ pack blue into pixels \n"
2292 " vst1.16 {d0, d1}, [%[dest]] @ store composited pixels \n"
2293 " add %[dest], %[dest], %[dest_stride] @ advance framebuffer pointer \n"
2294 " bne 0b @ next please \n"
2296 /* Clobbered registers marked as input/outputs */
2297 : [dest] "+r" (dest), [count] "+r" (count)
2300 : [dest_stride] "r" (dest_stride), [colour] "r" (&colour)
2302 /* Clobbers, including the inputs we modify, and
2303 * potentially lots of memory
2305 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d18", "d19",
2306 "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
2312 neon_composite_over_n_0565 (pixman_implementation_t * impl,
2314 pixman_image_t * src_image,
2315 pixman_image_t * mask_image,
2316 pixman_image_t * dst_image,
2327 uint16_t *dst_line, *aligned_line;
2328 uint32_t dst_stride;
2329 uint32_t kernel_count, copy_count, copy_tail;
2330 uint8_t kernel_offset, copy_offset;
2332 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2334 /* bail out if fully transparent */
2339 if (width == 0 || height == 0)
2342 if (width > NEON_SCANLINE_BUFFER_PIXELS)
2344 /* split the blit, so we can use a fixed-size scanline buffer
2345 * TODO: there must be a more elegant way of doing this.
2349 for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS)
2351 neon_composite_over_n_0565 (
2353 src_image, mask_image, dst_image,
2354 src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y,
2355 (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height);
2360 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2362 /* keep within minimum number of aligned quadwords on width
2363 * while also keeping the minimum number of columns to process
2366 unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
2367 unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
2368 unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
2370 /* the fast copy should be quadword aligned */
2371 copy_offset = dst_line - ((uint16_t*) aligned_left);
2372 aligned_line = dst_line - copy_offset;
2373 copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
2376 if (aligned_right - aligned_left > ceiling_length)
2378 /* unaligned routine is tightest */
2379 kernel_count = (uint32_t) (ceiling_length >> 4);
2380 kernel_offset = copy_offset;
2384 /* aligned routine is equally tight, so it is safer to align */
2385 kernel_count = copy_count;
2389 /* We should avoid reading beyond scanline ends for safety */
2390 if (aligned_line < (dst_line - dest_x) ||
2391 (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
2393 /* switch to precise read */
2394 copy_offset = kernel_offset = 0;
2395 aligned_line = dst_line;
2396 kernel_count = (uint32_t) (ceiling_length >> 4);
2397 copy_count = (width * sizeof(*dst_line)) >> 4;
2398 copy_tail = (width * sizeof(*dst_line)) & 0xF;
2403 uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8]; /* deliberately not initialised */
2405 /* row-major order */
2406 /* left edge, middle block, right edge */
2407 for ( ; height--; aligned_line += dst_stride, dst_line += dst_stride)
2409 /* Uncached framebuffer access is really, really slow if we do it piecemeal.
2410 * It should be much faster if we grab it all at once.
2411 * One scanline should easily fit in L1 cache, so this should
2412 * not waste RAM bandwidth.
2414 neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail);
2416 /* Apply the actual filter */
2417 plain_over_565_8_pix_neon (
2418 src, scan_line + kernel_offset, 8 * sizeof(*dst_line), kernel_count);
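/* Note: the third argument, 8 * sizeof(*dst_line) == 16 bytes, is the distance
 * between successive 8-pixel groups.  Because the kernel runs over the
 * contiguous staging buffer rather than the real framebuffer, each group
 * simply follows the previous one.
 */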
2420 /* Copy the modified scanline back */
2421 neon_quadword_copy (
2422 dst_line, scan_line + copy_offset, width >> 3, (width & 7) * 2);
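/* The write-back above copies exactly 'width' pixels: width >> 3 whole
 * quadwords plus a tail of (width & 7) pixels, i.e. (width & 7) * 2 bytes.
 * For example width == 100 gives 12 quadwords plus an 8 byte tail.  Unlike
 * the staging read, which may round out to quadword boundaries, the store
 * must not touch pixels outside the requested region.
 */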
2428 ARGB8_over_565_8_pix_neon (uint32_t *src,
2430 uint32_t src_stride, /* bytes, not elements */
2431 uint32_t count /* 8-pixel groups */)
2435 " pld [%[src], %[src_stride]] @ preload from next scanline \n"
2436 " vld1.16 {d0, d1}, [%[dest]] @ load pixels from framebuffer \n"
2437 " vld4.8 {d20, d21, d22, d23},[%[src]]! @ load source image pixels \n"
2438 " vsli.u16 q3, q0, #5 @ duplicate framebuffer blue bits \n"
2439 " vshrn.u16 d2, q0, #8 @ unpack red from framebuffer pixels \n"
2440 " vshrn.u16 d4, q0, #3 @ unpack green \n"
2441 " vmvn d18, d23 @ we need the inverse alpha for the background \n"
2442 " vsri.u8 d2, d2, #5 @ duplicate red bits (extend 5 to 8) \n"
2443 " vshrn.u16 d6, q3, #2 @ unpack extended blue (truncate 10 to 8) \n"
2444 " vsri.u8 d4, d4, #6 @ duplicate green bits (extend 6 to 8) \n"
2445 " vmull.u8 q1, d2, d18 @ apply inverse alpha to background red... \n"
2446 " vmull.u8 q2, d4, d18 @ ...green... \n"
2447 " vmull.u8 q3, d6, d18 @ ...blue \n"
2448 " subs %[count], %[count], #1 @ decrement/test loop counter \n"
2449 " vmlal.u8 q1, d23, d22 @ add blended foreground red... \n"
2450 " vmlal.u8 q2, d23, d21 @ ...green... \n"
2451 " vmlal.u8 q3, d23, d20 @ ...blue \n"
2452 " vsri.16 q1, q2, #5 @ pack green behind red \n"
2453 " vsri.16 q1, q3, #11 @ pack blue into pixels \n"
2454 " vst1.16 {d2, d3}, [%[dest]]! @ store composited pixels \n"
2455 " bne 0b @ next please \n"
2457 /* Clobbered registers marked as input/outputs */
2458 : [dest] "+r" (dest), [src] "+r" (src), [count] "+r" (count)
2461 : [src_stride] "r" (src_stride)
2463 /* Clobbers, including the inputs we modify, and potentially lots of memory */
2464 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d17", "d18", "d20",
2465 "d21", "d22", "d23", "cc", "memory"
2470 neon_composite_over_8888_0565 (pixman_implementation_t * impl,
2472 pixman_image_t * src_image,
2473 pixman_image_t * mask_image,
2474 pixman_image_t * dst_image,
2485 uint16_t *dst_line, *aligned_line;
2486 uint32_t dst_stride, src_stride;
2487 uint32_t kernel_count, copy_count, copy_tail;
2488 uint8_t kernel_offset, copy_offset;
2490 /* we assume mask is opaque
2491 * so the only alpha to deal with is embedded in src
2493 if (width > NEON_SCANLINE_BUFFER_PIXELS)
2495 /* split the blit, so we can use a fixed-size scanline buffer */
2497 for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS)
2499 neon_composite_over_8888_0565 (
2501 src_image, mask_image, dst_image,
2502 src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y,
2503 (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height);
2508 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2509 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2511 /* work on the smallest quadword-aligned window that covers the output width,
2512 * while still processing the minimum number of columns
2515 unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
2516 unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
2517 unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
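/* Same quadword-alignment scheme as in neon_composite_over_n_0565 above;
 * see the worked example there.
 */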
2519 /* the fast copy should be quadword aligned */
2520 copy_offset = dst_line - ((uint16_t*) aligned_left);
2521 aligned_line = dst_line - copy_offset;
2522 copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
copy_tail = 0; /* no unaligned tail unless the precise-read path below sets one */
2525 if (aligned_right - aligned_left > ceiling_length)
2527 /* unaligned routine is tightest */
2528 kernel_count = (uint32_t) (ceiling_length >> 4);
2529 kernel_offset = copy_offset;
2533 /* aligned routine is equally tight, so it is safer to align */
2534 kernel_count = copy_count;
kernel_offset = 0; /* run the kernel from the aligned start of the copy */
2538 /* Avoid reading beyond the ends of the scanline: the rounded-out window may fall outside the destination image */
2539 if (aligned_line < (dst_line - dest_x) ||
2540 (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
2542 /* switch to precise read */
2543 copy_offset = kernel_offset = 0;
2544 aligned_line = dst_line;
2545 kernel_count = (uint32_t) (ceiling_length >> 4);
2546 copy_count = (width * sizeof(*dst_line)) >> 4;
2547 copy_tail = (width * sizeof(*dst_line)) & 0xF;
2551 /* Preload the first input scanline */
2553 uint8_t *src_ptr = (uint8_t*) src_line;
2554 uint32_t count = (width + 15) / 16;
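/* The preload loop below advances the source pointer by 64 bytes (16
 * a8r8g8b8 pixels) per iteration, so (width + 15) / 16 iterations cover the
 * whole first scanline.  For example width == 100 gives count == 7, touching
 * 448 bytes, which covers the 400 bytes of source pixels.
 */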
2556 #ifdef USE_GCC_INLINE_ASM
2559 " subs %[count], %[count], #1 \n"
2561 " add %[src], %[src], #64 \n"
2564 /* Clobbered input registers marked as input/outputs */
2565 : [src] "+r" (src_ptr), [count] "+r" (count)
2566 : /* no unclobbered inputs */
2580 uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8]; /* deliberately not initialised */
2582 /* row-major order */
2583 /* left edge, middle block, right edge */
2584 for ( ; height--; src_line += src_stride, aligned_line += dst_stride)
2586 /* Uncached framebuffer access is really, really slow if we do
2587 * it piecemeal. It should be much faster if we grab it all at
2588 * once. One scanline should easily fit in L1 cache, so this
2589 * should not waste RAM bandwidth.
2591 neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail);
2593 /* Apply the actual filter */
2594 ARGB8_over_565_8_pix_neon (
2595 src_line, scan_line + kernel_offset,
2596 src_stride * sizeof(*src_line), kernel_count);
2598 /* Copy the modified scanline back */
2599 neon_quadword_copy (dst_line,
2600 scan_line + copy_offset,
2601 width >> 3, (width & 7) * 2);
2606 #endif /* USE_GCC_INLINE_ASM */
2608 static const pixman_fast_path_t arm_neon_fast_path_array[] =
2610 { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, neon_composite_add_8888_8_8, 0 },
2611 { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, neon_composite_add_8000_8000, 0 },
2612 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, neon_composite_over_n_8_0565, 0 },
2613 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, neon_composite_over_n_8_0565, 0 },
2614 { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_src_24_16, 0 },
2615 { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_src_24_16, 0 },
2616 { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_src_24_16, 0 },
2617 { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_src_24_16, 0 },
2618 #ifdef USE_GCC_INLINE_ASM
2619 { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_src_16_16, 0 },
2620 { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_src_16_16, 0 },
2621 #if 0 /* this code has some bugs */
2622 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_over_n_0565, 0 },
2623 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_over_n_0565, 0 },
2624 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_over_8888_0565, 0 },
2625 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_over_8888_0565, 0 },
2628 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, neon_composite_over_8888_8888, 0 },
2629 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, neon_composite_over_8888_8888, 0 },
2630 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, neon_composite_over_8888_8888, 0 },
2631 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, neon_composite_over_8888_8888, 0 },
2632 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK },
2633 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK },
2634 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, neon_composite_over_n_8_8888, 0 },
2635 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, neon_composite_over_n_8_8888, 0 },
2636 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, neon_composite_over_n_8_8888, 0 },
2637 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, neon_composite_over_n_8_8888, 0 },
2641 const pixman_fast_path_t *const arm_neon_fast_paths = arm_neon_fast_path_array;
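/* Each entry in arm_neon_fast_path_array pairs an operator with source, mask
 * and destination formats and the routine that handles exactly that
 * combination: PIXMAN_null means no mask, PIXMAN_solid a solid-colour source,
 * and the final flags field (e.g. NEED_SOLID_MASK) constrains the match
 * further.  _pixman_run_fast_path scans this table and, presumably, runs the
 * first entry that matches, so more specific rows should precede more general
 * ones.
 */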
2644 arm_neon_composite (pixman_implementation_t *imp,
2646 pixman_image_t * src,
2647 pixman_image_t * mask,
2648 pixman_image_t * dest,
2658 if (_pixman_run_fast_path (arm_neon_fast_paths, imp,
2659 op, src, mask, dest,
2668 _pixman_implementation_composite (imp->delegate, op,
2676 static pixman_bool_t
2677 pixman_blt_neon (void *src_bits,
2690 if (!width || !height)
2693 /* accelerate only straight copies involving complete bytes */
2694 if (src_bpp != dst_bpp || (src_bpp & 7))
2698 uint32_t bytes_per_pixel = src_bpp >> 3;
2699 uint32_t byte_width = width * bytes_per_pixel;
2700 /* pixman strides are given in 32-bit words, so convert them to bytes */
2701 int32_t src_stride_bytes = src_stride * 4;
2702 int32_t dst_stride_bytes = dst_stride * 4;
2703 uint8_t *src_bytes = ((uint8_t*) src_bits) +
2704 src_y * src_stride_bytes + src_x * bytes_per_pixel;
2705 uint8_t *dst_bytes = ((uint8_t*) dst_bits) +
2706 dst_y * dst_stride_bytes + dst_x * bytes_per_pixel;
2707 uint32_t quadword_count = byte_width / 16;
2708 uint32_t offset = byte_width % 16;
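/* Example of the addressing arithmetic: a 16 bpp copy 100 pixels wide gives
 * bytes_per_pixel == 2, byte_width == 200, quadword_count == 12 and
 * offset == 8, so each row is copied as 12 full quadwords plus an 8 byte tail.
 */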
2712 neon_quadword_copy (dst_bytes, src_bytes, quadword_count, offset);
2713 src_bytes += src_stride_bytes;
2714 dst_bytes += dst_stride_bytes;
2721 static pixman_bool_t
2722 arm_neon_blt (pixman_implementation_t *imp,
2723 uint32_t * src_bits,
2724 uint32_t * dst_bits,
2736 if (pixman_blt_neon (
2737 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
2738 src_x, src_y, dst_x, dst_y, width, height))
2743 return _pixman_implementation_blt (
2745 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
2746 src_x, src_y, dst_x, dst_y, width, height);
2749 static pixman_bool_t
2750 arm_neon_fill (pixman_implementation_t *imp,
2760 if (pixman_fill_neon (bits, stride, bpp, x, y, width, height, xor))
2763 return _pixman_implementation_fill (
2764 imp->delegate, bits, stride, bpp, x, y, width, height, xor);
2767 pixman_implementation_t *
2768 _pixman_implementation_create_arm_neon (void)
2770 pixman_implementation_t *simd = _pixman_implementation_create_arm_simd ();
2771 pixman_implementation_t *imp = _pixman_implementation_create (simd);
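/* The NEON implementation is layered on top of the ARM SIMD one, which in
 * turn sits on the general fallback paths: anything the routines above cannot
 * handle is passed down the chain through imp->delegate, as in
 * arm_neon_composite, arm_neon_blt and arm_neon_fill.
 */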
2773 imp->composite = arm_neon_composite;
2774 #if 0 /* this code has some bugs */
2775 imp->blt = arm_neon_blt;
2777 imp->fill = arm_neon_fill;