2 * Copyright © 2009 ARM Ltd, Movial Creative Technologies Oy
4 * Permission to use, copy, modify, distribute, and sell this software and its
5 * documentation for any purpose is hereby granted without fee, provided that
6 * the above copyright notice appear in all copies and that both that
7 * copyright notice and this permission notice appear in supporting
8 * documentation, and that the name of ARM Ltd not be used in
9 * advertising or publicity pertaining to distribution of the software without
10 * specific, written prior permission. ARM Ltd makes no
11 * representations about the suitability of this software for any purpose. It
12 * is provided "as is" without express or implied warranty.
14 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
15 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
16 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
17 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
18 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
19 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
20 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
23 * Author: Ian Rickards (ian.rickards@arm.com)
24 * Author: Jonathan Morton (jonathan.morton@movial.com)
25 * Author: Markku Vire (markku.vire@movial.com)
35 #include "pixman-private.h"
37 /* Deal with an intrinsic that is defined differently in GCC */
38 #if !defined(__ARMCC_VERSION) && !defined(__pld)
/* On non-ARMCC compilers, map ARMCC's __pld cache-preload intrinsic onto
 * GCC's __builtin_prefetch so both toolchains can use __pld uniformly. */
39 #define __pld(_x) __builtin_prefetch (_x)
/* unpack0565: expand eight r5g6b5 pixels (uint16x8_t) into four planar
 * 8-bit channel vectors (uint8x8x4_t).  Each 5/6-bit field is shifted to
 * the top of its byte and then replicated into the low bits with vsri so
 * the result covers the full 0..255 range.  val[3] (alpha) is forced to 0.
 * NOTE(review): this chunk is missing interior lines (local declarations
 * of res/gb/b, braces and the return) — only the visible lines are shown. */
42 static force_inline uint8x8x4_t
43 unpack0565 (uint16x8_t rgb)
/* 0565 carries no alpha; zero it explicitly. */
48 res.val[3] = vdup_n_u8 (0);
/* Shift the green field (and above) down to bit 0 of each lane. */
49 gb = vshrq_n_u16 (rgb, 5);
/* Shift the top 5-bit field down to bit 0 of each lane. */
50 b = vshrq_n_u16 (rgb, 5 + 6);
52 res.val[0] = vmovn_u16 (rgb); /* get low 5 bits */
53 res.val[1] = vmovn_u16 (gb); /* get mid 6 bits */
54 res.val[2] = vmovn_u16 (b); /* get top 5 bits */
56 res.val[0] = vshl_n_u8 (res.val[0], 3); /* shift to top */
57 res.val[1] = vshl_n_u8 (res.val[1], 2); /* shift to top */
58 res.val[2] = vshl_n_u8 (res.val[2], 3); /* shift to top */
/* Replicate the top bits into the vacated low bits so e.g. 0x1f -> 0xff. */
60 res.val[0] = vsri_n_u8 (res.val[0], res.val[0], 5);
61 res.val[1] = vsri_n_u8 (res.val[1], res.val[1], 6);
62 res.val[2] = vsri_n_u8 (res.val[2], res.val[2], 5);
67 #ifdef USE_GCC_INLINE_ASM
68 /* Some versions of gcc have problems with vshll_n_u8 intrinsic (Bug 23576) */
/* Statement-expression replacement: widen each u8 lane of 'a' to u16 and
 * shift left by the immediate 'n', emitting the vshll.u8 instruction
 * directly.  'n' must be a compile-time constant ("i" constraint). */
69 #define vshll_n_u8(a, n) ({ uint16x8_t r; \
70 asm ("vshll.u8 %q0, %P1, %2\n" : "=w" (r) : "w" (a), "i" (n)); r; })
/* pack0565: repack four planar 8-bit channel vectors (uint8x8x4_t) into
 * eight r5g6b5 pixels (uint16x8_t).  Each channel is widened into the top
 * byte of a 16-bit lane with vshll, then vsri inserts the next channel's
 * top bits below the previous field (s.val[2] ends up in the topmost bits,
 * s.val[1] below it, s.val[0] in the low bits).  Alpha (s.val[3]) is
 * dropped.  NOTE(review): the return statement is outside this view. */
73 static force_inline uint16x8_t
74 pack0565 (uint8x8x4_t s)
76 uint16x8_t rgb, val_g, val_r;
78 rgb = vshll_n_u8 (s.val[2], 8);
79 val_g = vshll_n_u8 (s.val[1], 8);
80 val_r = vshll_n_u8 (s.val[0], 8);
/* Insert the 6-bit field 5 bits down, then the last 5-bit field 11 down. */
81 rgb = vsriq_n_u16 (rgb, val_g, 5);
82 rgb = vsriq_n_u16 (rgb, val_r, 5 + 6);
/* neon2mul: per-byte multiply of x by alpha with division by 255, i.e.
 * round(x * alpha / 255) per lane, computed as (t + ((t + 128) >> 8)) >> 8
 * via vmull/vrshr/vraddhn.  NOTE(review): the second parameter (alpha) and
 * the local declarations/return are in lines missing from this view. */
87 static force_inline uint8x8_t
88 neon2mul (uint8x8_t x,
/* Widening multiply: 8x8 -> 16-bit intermediate products. */
94 tmp = vmull_u8 (x, alpha);
/* Rounded narrowing sequence approximating division by 255. */
95 tmp2 = vrshrq_n_u16 (tmp, 8);
96 res = vraddhn_u16 (tmp, tmp2);
/* neon8mul: apply neon2mul's per-byte multiply-by-alpha (with /255) to all
 * four channels of a planar uint8x8x4_t.  The vrshr/vraddhn pairs are
 * interleaved across channels, presumably to pipeline the NEON units.
 * NOTE(review): the alpha parameter, 'tmp'/'res' declarations and the
 * return are in lines missing from this view. */
101 static force_inline uint8x8x4_t
102 neon8mul (uint8x8x4_t x,
107 uint16x8_t qtmp1, qtmp2;
/* Widening multiplies for all four channels. */
109 tmp.val[0] = vmull_u8 (x.val[0], alpha);
110 tmp.val[1] = vmull_u8 (x.val[1], alpha);
111 tmp.val[2] = vmull_u8 (x.val[2], alpha);
112 tmp.val[3] = vmull_u8 (x.val[3], alpha);
/* Rounded narrowing (divide by 255) — same idiom as neon2mul, interleaved. */
114 qtmp1 = vrshrq_n_u16 (tmp.val[0], 8);
115 qtmp2 = vrshrq_n_u16 (tmp.val[1], 8);
116 res.val[0] = vraddhn_u16 (tmp.val[0], qtmp1);
117 qtmp1 = vrshrq_n_u16 (tmp.val[2], 8);
118 res.val[1] = vraddhn_u16 (tmp.val[1], qtmp2);
119 qtmp2 = vrshrq_n_u16 (tmp.val[3], 8);
120 res.val[2] = vraddhn_u16 (tmp.val[2], qtmp1);
121 res.val[3] = vraddhn_u16 (tmp.val[3], qtmp2);
/* neon8qadd: per-channel saturating byte add of two planar uint8x8x4_t
 * pixel groups (used as the final "src + dst*(1-a)" step of OVER).
 * NOTE(review): the 'y' parameter, 'res' declaration and the return are in
 * lines missing from this view. */
126 static force_inline uint8x8x4_t
127 neon8qadd (uint8x8x4_t x,
132 res.val[0] = vqadd_u8 (x.val[0], y.val[0]);
133 res.val[1] = vqadd_u8 (x.val[1], y.val[1]);
134 res.val[2] = vqadd_u8 (x.val[2], y.val[2]);
135 res.val[3] = vqadd_u8 (x.val[3], y.val[3]);
/* neon_composite_add_8000_8000: ADD operator for a8 source onto a8 dest
 * (saturating per-byte add, one byte per pixel).  Wide scanlines use the
 * overlapping 8-pixel method: loads run ahead of a lagging store through
 * 'keep_dst' so a partially-overlapping first iteration is handled without
 * per-pixel tails.  Narrow scanlines use 4/2/1-pixel lane loads/stores.
 * NOTE(review): this chunk is missing interior lines (function brace,
 * remaining parameters, loop headers, width tests, asm labels/#endif) —
 * only the visible lines are documented. */
141 neon_composite_add_8000_8000 (pixman_implementation_t * impl,
143 pixman_image_t * src_image,
144 pixman_image_t * mask_image,
145 pixman_image_t * dst_image,
155 uint8_t *dst_line, *dst;
156 uint8_t *src_line, *src;
157 int dst_stride, src_stride;
160 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
161 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
165 /* Use overlapping 8-pixel method */
168 uint8_t *keep_dst = 0;
169 uint8x8_t sval, dval, temp;
172 dst_line += dst_stride;
174 src_line += src_stride;
177 #ifndef USE_GCC_INLINE_ASM
/* Intrinsics variant: prime the pipeline with the first 8 pixels... */
178 sval = vld1_u8 ((void *)src);
179 dval = vld1_u8 ((void *)dst);
182 temp = vqadd_u8 (dval, sval);
/* ...then in the steady state, load the next group and store the
 * previous result one iteration behind through keep_dst. */
190 sval = vld1_u8 ((void *)src);
191 dval = vld1_u8 ((void *)dst);
193 vst1_u8 ((void *)keep_dst, temp);
196 temp = vqadd_u8 (dval, sval);
/* Final lagging store after the loop. */
203 vst1_u8 ((void *)keep_dst, temp);
/* Inline-asm variant of the same overlapping scheme. */
206 /* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
207 "vld1.8 {d0}, [%[src]]\n\t"
208 "vld1.8 {d4}, [%[dst]]\n\t"
209 "mov %[keep_dst], %[dst]\n\t"
/* Advance by (w & 7) so the remaining count is a multiple of 8; the first
 * stored group overlaps the unaligned head. */
211 "and ip, %[w], #7\n\t"
212 "add %[src], %[src], ip\n\t"
213 "add %[dst], %[dst], ip\n\t"
214 "subs %[w], %[w], ip\n\t"
218 "vld1.8 {d0}, [%[src]]!\n\t"
219 "vld1.8 {d4}, [%[dst]]!\n\t"
220 "vst1.8 {d20}, [%[keep_dst]]\n\t"
221 "sub %[keep_dst], %[dst], #8\n\t"
222 "subs %[w], %[w], #8\n\t"
224 "vqadd.u8 d20, d0, d4\n\t"
229 "vst1.8 {d20}, [%[keep_dst]]\n\t"
231 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
233 : "ip", "cc", "memory", "d0", "d4",
/* Narrow path (width < 8): build the pixels up from 4/2/1-byte lane
 * loads, do one vector vqadd, then store back in matching pieces. */
241 const uint8_t nil = 0;
242 const uint8x8_t vnil = vld1_dup_u8 (&nil);
246 uint8x8_t sval = vnil, dval = vnil;
247 uint8_t *dst4 = 0, *dst2 = 0;
250 dst_line += dst_stride;
252 src_line += src_stride;
257 sval = vreinterpret_u8_u32 (
258 vld1_lane_u32 ((void *)src, vreinterpret_u32_u8 (sval), 1));
259 dval = vreinterpret_u8_u32 (
260 vld1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (dval), 1));
269 sval = vreinterpret_u8_u16 (
270 vld1_lane_u16 ((void *)src, vreinterpret_u16_u8 (sval), 1));
271 dval = vreinterpret_u8_u16 (
272 vld1_lane_u16 ((void *)dst, vreinterpret_u16_u8 (dval), 1));
281 sval = vld1_lane_u8 (src, sval, 1);
282 dval = vld1_lane_u8 (dst, dval, 1);
/* One saturating add covers all gathered lanes at once. */
285 dval = vqadd_u8 (dval, sval);
288 vst1_lane_u8 (dst, dval, 1);
291 vst1_lane_u16 ((void *)dst2, vreinterpret_u16_u8 (dval), 1);
294 vst1_lane_u32 ((void *)dst4, vreinterpret_u32_u8 (dval), 1);
/* neon_composite_over_8888_8888: OVER operator for a8r8g8b8 source onto
 * a8r8g8b8 dest: dst = src + dst * (255 - src.alpha) / 255, per channel,
 * with saturation.  Wide scanlines use the overlapping 8-pixel method
 * (lagging store through 'keep_dst'); narrow scanlines handle 2 pixels
 * then 1 pixel using D-register loads.  vld4/vst4 deinterleave the pixels
 * into planar channels so sval.val[3] is the source alpha vector.
 * NOTE(review): interior lines (braces, remaining parameters, loop
 * headers, width tests, asm labels/#endif) are missing from this view. */
300 neon_composite_over_8888_8888 (pixman_implementation_t * impl,
302 pixman_image_t * src_image,
303 pixman_image_t * mask_image,
304 pixman_image_t * dst_image,
314 uint32_t *dst_line, *dst;
315 uint32_t *src_line, *src;
316 int dst_stride, src_stride;
319 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
320 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
324 /* Use overlapping 8-pixel method */
327 uint32_t *keep_dst = 0;
328 uint8x8x4_t sval, dval, temp;
331 dst_line += dst_stride;
333 src_line += src_stride;
336 #ifndef USE_GCC_INLINE_ASM
/* Intrinsics variant: prime with the first 8 pixels. */
337 sval = vld4_u8 ((void *)src);
338 dval = vld4_u8 ((void *)dst);
/* OVER: dst * (255 - src_alpha) then saturating add of src. */
341 temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
342 temp = neon8qadd (sval, temp);
/* Steady state: next load, lagging store, recompute. */
350 sval = vld4_u8 ((void *)src);
351 dval = vld4_u8 ((void *)dst);
353 vst4_u8 ((void *)keep_dst, temp);
356 temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
357 temp = neon8qadd (sval, temp);
364 vst4_u8 ((void *)keep_dst, temp);
/* Inline-asm variant of the same scheme; d3/d31 hold the alpha channel. */
367 /* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
368 "vld4.8 {d0-d3}, [%[src]]\n\t"
369 "vld4.8 {d4-d7}, [%[dst]]\n\t"
370 "mov %[keep_dst], %[dst]\n\t"
/* Advance by (w & 7) pixels (LSL#2: 4 bytes per pixel). */
372 "and ip, %[w], #7\n\t"
373 "add %[src], %[src], ip, LSL#2\n\t"
374 "add %[dst], %[dst], ip, LSL#2\n\t"
375 "subs %[w], %[w], ip\n\t"
379 "vld4.8 {d0-d3}, [%[src]]!\n\t"
380 "vld4.8 {d4-d7}, [%[dst]]!\n\t"
381 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
382 "sub %[keep_dst], %[dst], #8*4\n\t"
383 "subs %[w], %[w], #8\n\t"
/* dst * inverted-src-alpha (d31), with the /255 round-narrow idiom. */
386 "vmull.u8 q10, d31, d4\n\t"
387 "vmull.u8 q11, d31, d5\n\t"
388 "vmull.u8 q12, d31, d6\n\t"
389 "vmull.u8 q13, d31, d7\n\t"
390 "vrshr.u16 q8, q10, #8\n\t"
391 "vrshr.u16 q9, q11, #8\n\t"
392 "vraddhn.u16 d20, q10, q8\n\t"
393 "vraddhn.u16 d21, q11, q9\n\t"
394 "vrshr.u16 q8, q12, #8\n\t"
395 "vrshr.u16 q9, q13, #8\n\t"
396 "vraddhn.u16 d22, q12, q8\n\t"
397 "vraddhn.u16 d23, q13, q9\n\t"
398 /* result in d20-d23 */
399 "vqadd.u8 d20, d0, d20\n\t"
400 "vqadd.u8 d21, d1, d21\n\t"
401 "vqadd.u8 d22, d2, d22\n\t"
402 "vqadd.u8 d23, d3, d23\n\t"
407 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
409 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
411 : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
412 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23"
/* Table to broadcast each pixel's alpha byte (offsets 3 and 7) across its
 * four channel bytes for the 2-pixels-in-one-D-reg path below. */
419 uint8x8_t alpha_selector = vreinterpret_u8_u64 (
420 vcreate_u64 (0x0707070703030303ULL));
422 /* Handle width < 8 */
426 dst_line += dst_stride;
428 src_line += src_stride;
433 uint8x8_t sval, dval;
435 /* two 32-bit pixels packed into D-reg; ad-hoc vectorization */
436 sval = vreinterpret_u8_u32 (vld1_u32 ((void *)src));
437 dval = vreinterpret_u8_u32 (vld1_u32 ((void *)dst));
438 dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
439 vst1_u8 ((void *)dst, vqadd_u8 (sval, dval));
448 uint8x8_t sval, dval;
450 /* single 32-bit pixel in lane 0 */
451 sval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)src)); /* only interested in lane 0 */
452 dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)dst)); /* only interested in lane 0 */
453 dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
454 vst1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (vqadd_u8 (sval, dval)), 0);
/* neon_composite_over_8888_n_8888: OVER with a solid (constant) mask:
 * src' = src * mask.alpha / 255, then dst = src' + dst * (255 -
 * src'.alpha) / 255.  Only the mask's alpha byte (mask >> 24) is used.
 * Structure mirrors neon_composite_over_8888_8888: overlapping 8-pixel
 * method for wide scanlines, 2-then-1 pixel tail for narrow ones.
 * NOTE(review): interior lines (braces, remaining parameters, loop
 * headers, width tests, asm labels/#endif) are missing from this view. */
461 neon_composite_over_8888_n_8888 (pixman_implementation_t * impl,
463 pixman_image_t * src_image,
464 pixman_image_t * mask_image,
465 pixman_image_t * dst_image,
475 uint32_t *dst_line, *dst;
476 uint32_t *src_line, *src;
478 int dst_stride, src_stride;
480 uint8x8_t mask_alpha;
482 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
483 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
/* Broadcast the solid mask's alpha byte across a D register. */
485 mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
486 mask_alpha = vdup_n_u8 ((mask) >> 24);
490 /* Use overlapping 8-pixel method */
494 dst_line += dst_stride;
496 src_line += src_stride;
499 uint32_t *keep_dst = 0;
501 #ifndef USE_GCC_INLINE_ASM
502 uint8x8x4_t sval, dval, temp;
504 sval = vld4_u8 ((void *)src);
505 dval = vld4_u8 ((void *)dst);
/* Scale source by the constant mask alpha first, then standard OVER. */
508 sval = neon8mul (sval, mask_alpha);
509 temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
510 temp = neon8qadd (sval, temp);
/* Steady state with lagging store through keep_dst. */
518 sval = vld4_u8 ((void *)src);
519 dval = vld4_u8 ((void *)dst);
521 vst4_u8 ((void *)keep_dst, temp);
524 sval = neon8mul (sval, mask_alpha);
525 temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
526 temp = neon8qadd (sval, temp);
532 vst4_u8 ((void *)keep_dst, temp);
/* Inline-asm variant; d30 = broadcast mask alpha, d31 = scaled src alpha. */
535 /* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
536 "vdup.32 d30, %[mask]\n\t"
537 "vdup.8 d30, d30[3]\n\t"
539 "vld4.8 {d0-d3}, [%[src]]\n\t"
540 "vld4.8 {d4-d7}, [%[dst]]\n\t"
541 "mov %[keep_dst], %[dst]\n\t"
543 "and ip, %[w], #7\n\t"
544 "add %[src], %[src], ip, LSL#2\n\t"
545 "add %[dst], %[dst], ip, LSL#2\n\t"
546 "subs %[w], %[w], ip\n\t"
550 "vld4.8 {d0-d3}, [%[src]]!\n\t"
551 "vld4.8 {d4-d7}, [%[dst]]!\n\t"
552 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
553 "sub %[keep_dst], %[dst], #8*4\n\t"
554 "subs %[w], %[w], #8\n\t"
/* src (d0-d3) *= mask alpha (d30), /255 round-narrow idiom. */
557 "vmull.u8 q10, d30, d0\n\t"
558 "vmull.u8 q11, d30, d1\n\t"
559 "vmull.u8 q12, d30, d2\n\t"
560 "vmull.u8 q13, d30, d3\n\t"
561 "vrshr.u16 q8, q10, #8\n\t"
562 "vrshr.u16 q9, q11, #8\n\t"
563 "vraddhn.u16 d0, q10, q8\n\t"
564 "vraddhn.u16 d1, q11, q9\n\t"
565 "vrshr.u16 q9, q13, #8\n\t"
566 "vrshr.u16 q8, q12, #8\n\t"
567 "vraddhn.u16 d3, q13, q9\n\t"
568 "vraddhn.u16 d2, q12, q8\n\t"
/* dst (d4-d7) *= inverted scaled-src alpha (d31). */
571 "vmull.u8 q10, d31, d4\n\t"
572 "vmull.u8 q11, d31, d5\n\t"
573 "vmull.u8 q12, d31, d6\n\t"
574 "vmull.u8 q13, d31, d7\n\t"
575 "vrshr.u16 q8, q10, #8\n\t"
576 "vrshr.u16 q9, q11, #8\n\t"
577 "vraddhn.u16 d20, q10, q8\n\t"
578 "vrshr.u16 q8, q12, #8\n\t"
579 "vraddhn.u16 d21, q11, q9\n\t"
580 "vrshr.u16 q9, q13, #8\n\t"
581 "vraddhn.u16 d22, q12, q8\n\t"
582 "vraddhn.u16 d23, q13, q9\n\t"
584 /* result in d20-d23 */
585 "vqadd.u8 d20, d0, d20\n\t"
586 "vqadd.u8 d21, d1, d21\n\t"
587 "vqadd.u8 d22, d2, d22\n\t"
588 "vqadd.u8 d23, d3, d23\n\t"
593 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
595 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
597 : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
598 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
/* Broadcast-alpha table for the interleaved 2-pixels-per-D-reg tail. */
606 uint8x8_t alpha_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0707070703030303ULL));
608 /* Handle width < 8 */
612 dst_line += dst_stride;
614 src_line += src_stride;
619 uint8x8_t sval, dval;
621 sval = vreinterpret_u8_u32 (vld1_u32 ((void *)src));
622 dval = vreinterpret_u8_u32 (vld1_u32 ((void *)dst));
624 /* sval * const alpha_mul */
625 sval = neon2mul (sval, mask_alpha);
627 /* dval * 255-(src alpha) */
628 dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
630 vst1_u8 ((void *)dst, vqadd_u8 (sval, dval));
639 uint8x8_t sval, dval;
641 sval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)src));
642 dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)dst));
644 /* sval * const alpha_mul */
645 sval = neon2mul (sval, mask_alpha);
647 /* dval * 255-(src alpha) */
648 dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
650 vst1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (vqadd_u8 (sval, dval)), 0);
/* neon_composite_over_n_8_0565: OVER of a solid source, modulated by an
 * a8 mask, onto an r5g6b5 destination.  Per pixel:
 *   s = solid_src * mask / 255
 *   dst565 = pack0565(s + unpack0565(dst565) * (255 - s.alpha) / 255)
 * Wide scanlines use the overlapping 8-pixel method (lagging store via
 * keep_dst); narrow scanlines use 4/2/1-pixel lane accesses.  The asm
 * variant inlines the unpack/pack sequences.  NOTE(review): interior
 * lines (braces, remaining parameters, loop headers, width tests, asm
 * labels/#endif) are missing from this view. */
657 neon_composite_over_n_8_0565 (pixman_implementation_t * impl,
659 pixman_image_t * src_image,
660 pixman_image_t * mask_image,
661 pixman_image_t * dst_image,
672 uint16_t *dst_line, *dst;
673 uint8_t *mask_line, *mask;
674 int dst_stride, mask_stride;
679 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
/* Splat each byte of the solid colour into its own channel vector. */
685 sval2=vreinterpret_u8_u32 (vdup_n_u32 (src));
686 sval8.val[0]=vdup_lane_u8 (sval2,0);
687 sval8.val[1]=vdup_lane_u8 (sval2,1);
688 sval8.val[2]=vdup_lane_u8 (sval2,2);
689 sval8.val[3]=vdup_lane_u8 (sval2,3);
691 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
692 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
696 /* Use overlapping 8-pixel method, modified to avoid rewritten dest being reused */
699 uint16_t *keep_dst=0;
702 dst_line += dst_stride;
704 mask_line += mask_stride;
707 #ifndef USE_GCC_INLINE_ASM
709 uint16x8_t dval, temp;
710 uint8x8x4_t sval8temp;
/* Prime: 8 mask bytes + 8 dest 565 pixels. */
712 alpha = vld1_u8 ((void *)mask);
713 dval = vld1q_u16 ((void *)dst);
716 sval8temp = neon8mul (sval8, alpha);
717 temp = pack0565 (neon8qadd (sval8temp, neon8mul (unpack0565 (dval), vmvn_u8 (sval8temp.val[3]))));
/* Steady state with lagging store through keep_dst. */
725 dval = vld1q_u16 ((void *)dst);
726 alpha = vld1_u8 ((void *)mask);
728 vst1q_u16 ((void *)keep_dst, temp);
731 sval8temp = neon8mul (sval8, alpha);
732 temp = pack0565 (neon8qadd (sval8temp, neon8mul (unpack0565 (dval), vmvn_u8 (sval8temp.val[3]))));
738 vst1q_u16 ((void *)keep_dst, temp);
/* Inline-asm variant: d0-d3 = splatted solid colour, d31 = mask bytes. */
741 "vdup.32 d0, %[src]\n\t"
742 "vdup.8 d1, d0[1]\n\t"
743 "vdup.8 d2, d0[2]\n\t"
744 "vdup.8 d3, d0[3]\n\t"
745 "vdup.8 d0, d0[0]\n\t"
747 "vld1.8 {q12}, [%[dst]]\n\t"
748 "vld1.8 {d31}, [%[mask]]\n\t"
749 "mov %[keep_dst], %[dst]\n\t"
/* Align remaining count to a multiple of 8 (LSL#1: 2 bytes per pixel). */
751 "and ip, %[w], #7\n\t"
752 "add %[mask], %[mask], ip\n\t"
753 "add %[dst], %[dst], ip, LSL#1\n\t"
754 "subs %[w], %[w], ip\n\t"
759 "vld1.16 {q12}, [%[dst]]!\n\t"
760 "vld1.8 {d31}, [%[mask]]!\n\t"
761 "vst1.16 {q10}, [%[keep_dst]]\n\t"
762 "sub %[keep_dst], %[dst], #8*2\n\t"
763 "subs %[w], %[w], #8\n\t"
765 /* expand 0565 q12 to 8888 {d4-d7} */
766 "vmovn.u16 d4, q12\t\n"
767 "vshr.u16 q11, q12, #5\t\n"
768 "vshr.u16 q10, q12, #6+5\t\n"
769 "vmovn.u16 d5, q11\t\n"
770 "vmovn.u16 d6, q10\t\n"
771 "vshl.u8 d4, d4, #3\t\n"
772 "vshl.u8 d5, d5, #2\t\n"
773 "vshl.u8 d6, d6, #3\t\n"
774 "vsri.u8 d4, d4, #5\t\n"
775 "vsri.u8 d5, d5, #6\t\n"
776 "vsri.u8 d6, d6, #5\t\n"
/* solid colour (d0-d3) * mask (d31) -> d20-d23 via /255 round-narrow. */
778 "vmull.u8 q10, d31, d0\n\t"
779 "vmull.u8 q11, d31, d1\n\t"
780 "vmull.u8 q12, d31, d2\n\t"
781 "vmull.u8 q13, d31, d3\n\t"
782 "vrshr.u16 q8, q10, #8\n\t"
783 "vrshr.u16 q9, q11, #8\n\t"
784 "vraddhn.u16 d20, q10, q8\n\t"
785 "vraddhn.u16 d21, q11, q9\n\t"
786 "vrshr.u16 q9, q13, #8\n\t"
787 "vrshr.u16 q8, q12, #8\n\t"
788 "vraddhn.u16 d23, q13, q9\n\t"
789 "vraddhn.u16 d22, q12, q8\n\t"
791 /* duplicate in 4/2/1 & 8pix vsns */
/* dst channels (d4-d6) * inverted src alpha (d30 = ~d23). */
792 "vmvn.8 d30, d23\n\t"
793 "vmull.u8 q14, d30, d6\n\t"
794 "vmull.u8 q13, d30, d5\n\t"
795 "vmull.u8 q12, d30, d4\n\t"
796 "vrshr.u16 q8, q14, #8\n\t"
797 "vrshr.u16 q9, q13, #8\n\t"
798 "vraddhn.u16 d6, q14, q8\n\t"
799 "vrshr.u16 q8, q12, #8\n\t"
800 "vraddhn.u16 d5, q13, q9\n\t"
801 "vqadd.u8 d6, d6, d22\n\t" /* moved up */
802 "vraddhn.u16 d4, q12, q8\n\t"
803 /* intentionally don't calculate alpha */
804 /* result in d4-d6 */
806 /* "vqadd.u8 d6, d6, d22\n\t" ** moved up */
807 "vqadd.u8 d5, d5, d21\n\t"
808 "vqadd.u8 d4, d4, d20\n\t"
810 /* pack 8888 {d20-d23} to 0565 q10 */
811 "vshll.u8 q10, d6, #8\n\t"
812 "vshll.u8 q3, d5, #8\n\t"
813 "vshll.u8 q2, d4, #8\n\t"
814 "vsri.u16 q10, q3, #5\t\n"
815 "vsri.u16 q10, q2, #11\t\n"
820 "vst1.16 {q10}, [%[keep_dst]]\n\t"
822 : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst)
824 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
825 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
/* Narrow path (width < 8): gather 4/2/1 pixels into lanes, compute once,
 * then scatter the result back in the same pieces. */
835 void *dst4=0, *dst2=0;
838 dst_line += dst_stride;
840 mask_line += mask_stride;
/* The asm variant below is disabled; see the bug note at its head. */
844 #if 1 /* #ifndef USE_GCC_INLINE_ASM */
846 uint16x8_t dval, temp;
847 uint8x8x4_t sval8temp;
851 alpha = vreinterpret_u8_u32 (vld1_lane_u32 ((void *)mask, vreinterpret_u32_u8 (alpha),1));
852 dval = vreinterpretq_u16_u64 (vld1q_lane_u64 ((void *)dst, vreinterpretq_u64_u16 (dval),1));
859 alpha = vreinterpret_u8_u16 (vld1_lane_u16 ((void *)mask, vreinterpret_u16_u8 (alpha),1));
860 dval = vreinterpretq_u16_u32 (vld1q_lane_u32 ((void *)dst, vreinterpretq_u32_u16 (dval),1));
867 alpha = vld1_lane_u8 ((void *)mask, alpha,1);
868 dval = vld1q_lane_u16 ((void *)dst, dval,1);
871 sval8temp = neon8mul (sval8, alpha);
872 temp = pack0565 (neon8qadd (sval8temp, neon8mul (unpack0565 (dval), vmvn_u8 (sval8temp.val[3]))));
875 vst1q_lane_u16 ((void *)dst, temp,1);
877 vst1q_lane_u32 ((void *)dst2, vreinterpretq_u32_u16 (temp),1);
879 vst1q_lane_u64 ((void *)dst4, vreinterpretq_u64_u16 (temp),1);
881 /* this code has some bug (does not pass blitters-test) */
883 "vdup.32 d0, %[src]\n\t"
884 "vdup.8 d1, d0[1]\n\t"
885 "vdup.8 d2, d0[2]\n\t"
886 "vdup.8 d3, d0[3]\n\t"
887 "vdup.8 d0, d0[0]\n\t"
892 "vld1.64 {d25}, [%[dst]]\n\t"
893 "vld1.32 {d31[1]}, [%[mask]]\n\t"
894 "mov %[dst4], %[dst]\t\n"
895 "add %[mask], %[mask], #4\t\n"
896 "add %[dst], %[dst], #4*2\t\n"
901 "vld1.32 {d24[1]}, [%[dst]]\n\t"
902 "vld1.16 {d31[1]}, [%[mask]]\n\t"
903 "mov %[dst2], %[dst]\t\n"
904 "add %[mask], %[mask], #2\t\n"
905 "add %[dst], %[dst], #2*2\t\n"
910 "vld1.16 {d24[1]}, [%[dst]]\n\t"
911 "vld1.8 {d31[1]}, [%[mask]]\n\t"
914 /* expand 0565 q12 to 8888 {d4-d7} */
915 "vmovn.u16 d4, q12\t\n"
916 "vshr.u16 q11, q12, #5\t\n"
917 "vshr.u16 q10, q12, #6+5\t\n"
918 "vmovn.u16 d5, q11\t\n"
919 "vmovn.u16 d6, q10\t\n"
920 "vshl.u8 d4, d4, #3\t\n"
921 "vshl.u8 d5, d5, #2\t\n"
922 "vshl.u8 d6, d6, #3\t\n"
923 "vsri.u8 d4, d4, #5\t\n"
924 "vsri.u8 d5, d5, #6\t\n"
925 "vsri.u8 d6, d6, #5\t\n"
927 "vmull.u8 q10, d31, d0\n\t"
928 "vmull.u8 q11, d31, d1\n\t"
929 "vmull.u8 q12, d31, d2\n\t"
930 "vmull.u8 q13, d31, d3\n\t"
931 "vrshr.u16 q8, q10, #8\n\t"
932 "vrshr.u16 q9, q11, #8\n\t"
933 "vraddhn.u16 d20, q10, q8\n\t"
934 "vraddhn.u16 d21, q11, q9\n\t"
935 "vrshr.u16 q9, q13, #8\n\t"
936 "vrshr.u16 q8, q12, #8\n\t"
937 "vraddhn.u16 d23, q13, q9\n\t"
938 "vraddhn.u16 d22, q12, q8\n\t"
940 /* duplicate in 4/2/1 & 8pix vsns */
941 "vmvn.8 d30, d23\n\t"
942 "vmull.u8 q14, d30, d6\n\t"
943 "vmull.u8 q13, d30, d5\n\t"
944 "vmull.u8 q12, d30, d4\n\t"
945 "vrshr.u16 q8, q14, #8\n\t"
946 "vrshr.u16 q9, q13, #8\n\t"
947 "vraddhn.u16 d6, q14, q8\n\t"
948 "vrshr.u16 q8, q12, #8\n\t"
949 "vraddhn.u16 d5, q13, q9\n\t"
950 "vqadd.u8 d6, d6, d22\n\t" /* moved up */
951 "vraddhn.u16 d4, q12, q8\n\t"
952 /* intentionally don't calculate alpha */
953 /* result in d4-d6 */
955 /* "vqadd.u8 d6, d6, d22\n\t" ** moved up */
956 "vqadd.u8 d5, d5, d21\n\t"
957 "vqadd.u8 d4, d4, d20\n\t"
959 /* pack 8888 {d20-d23} to 0565 q10 */
960 "vshll.u8 q10, d6, #8\n\t"
961 "vshll.u8 q3, d5, #8\n\t"
962 "vshll.u8 q2, d4, #8\n\t"
963 "vsri.u16 q10, q3, #5\t\n"
964 "vsri.u16 q10, q2, #11\t\n"
967 "beq skip_store1\t\n"
968 "vst1.16 {d20[1]}, [%[dst]]\t\n"
971 "beq skip_store2\t\n"
972 "vst1.32 {d20[1]}, [%[dst2]]\t\n"
975 "beq skip_store4\t\n"
976 "vst1.16 {d21}, [%[dst4]]\t\n"
979 : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [dst4] "+r" (dst4), [dst2] "+r" (dst2)
981 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
982 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
/* neon_composite_over_n_8_8888: OVER of a solid source, modulated by an
 * a8 mask, onto an a8r8g8b8 destination:
 *   s = solid_src * mask / 255;  dst = s + dst * (255 - s.alpha) / 255.
 * Bails out early when the solid colour is fully transparent.  Wide
 * scanlines use the overlapping 8-pixel method; narrow ones use 2-then-1
 * pixel D-register paths with vtbl-based mask/alpha broadcasting.
 * NOTE(review): interior lines (braces, remaining parameters, loop
 * headers, width tests, asm labels/#endif) are missing from this view. */
991 neon_composite_over_n_8_8888 (pixman_implementation_t * impl,
993 pixman_image_t * src_image,
994 pixman_image_t * mask_image,
995 pixman_image_t * dst_image,
1006 uint32_t *dst_line, *dst;
1007 uint8_t *mask_line, *mask;
1008 int dst_stride, mask_stride;
/* mask_selector spreads 2 mask bytes (offsets 0 and 1) over the two
 * pixels in a D register; alpha_selector spreads each pixel's alpha byte
 * (offsets 3 and 7) over its four channel bytes. */
1012 uint8x8_t mask_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0101010100000000ULL));
1013 uint8x8_t alpha_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0707070703030303ULL));
1015 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
1017 /* bail out if fully transparent */
/* Splat each byte of the solid colour into its own channel vector. */
1022 sval2 = vreinterpret_u8_u32 (vdup_n_u32 (src));
1023 sval8.val[0] = vdup_lane_u8 (sval2, 0);
1024 sval8.val[1] = vdup_lane_u8 (sval2, 1);
1025 sval8.val[2] = vdup_lane_u8 (sval2, 2);
1026 sval8.val[3] = vdup_lane_u8 (sval2, 3);
1028 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1029 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1033 /* Use overlapping 8-pixel method, modified to avoid
1034 * rewritten dest being reused
1038 uint32_t *keep_dst = 0;
1041 dst_line += dst_stride;
1043 mask_line += mask_stride;
1046 #ifndef USE_GCC_INLINE_ASM
1048 uint8x8x4_t dval, temp;
1050 alpha = vld1_u8 ((void *)mask);
1051 dval = vld4_u8 ((void *)dst);
/* s = solid * mask; dst = s + dst * ~s.alpha (saturating). */
1054 temp = neon8mul (sval8, alpha);
1055 dval = neon8mul (dval, vmvn_u8 (temp.val[3]));
1056 temp = neon8qadd (temp, dval);
/* Steady state with lagging store through keep_dst. */
1064 alpha = vld1_u8 ((void *)mask);
1065 dval = vld4_u8 ((void *)dst);
1067 vst4_u8 ((void *)keep_dst, temp);
1070 temp = neon8mul (sval8, alpha);
1071 dval = neon8mul (dval, vmvn_u8 (temp.val[3]));
1072 temp = neon8qadd (temp, dval);
1078 vst4_u8 ((void *)keep_dst, temp);
/* Inline-asm variant: d0-d3 = splatted solid colour, d31 = mask bytes. */
1081 "vdup.32 d0, %[src]\n\t"
1082 "vdup.8 d1, d0[1]\n\t"
1083 "vdup.8 d2, d0[2]\n\t"
1084 "vdup.8 d3, d0[3]\n\t"
1085 "vdup.8 d0, d0[0]\n\t"
1087 "vld4.8 {d4-d7}, [%[dst]]\n\t"
1088 "vld1.8 {d31}, [%[mask]]\n\t"
1089 "mov %[keep_dst], %[dst]\n\t"
1091 "and ip, %[w], #7\n\t"
1092 "add %[mask], %[mask], ip\n\t"
1093 "add %[dst], %[dst], ip, LSL#2\n\t"
1094 "subs %[w], %[w], ip\n\t"
1098 "vld4.8 {d4-d7}, [%[dst]]!\n\t"
1099 "vld1.8 {d31}, [%[mask]]!\n\t"
1100 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
1101 "sub %[keep_dst], %[dst], #8*4\n\t"
1102 "subs %[w], %[w], #8\n\t"
/* solid (d0-d3) * mask (d31) -> d20-d23, /255 round-narrow idiom. */
1105 "vmull.u8 q10, d31, d0\n\t"
1106 "vmull.u8 q11, d31, d1\n\t"
1107 "vmull.u8 q12, d31, d2\n\t"
1108 "vmull.u8 q13, d31, d3\n\t"
1109 "vrshr.u16 q8, q10, #8\n\t"
1110 "vrshr.u16 q9, q11, #8\n\t"
1111 "vraddhn.u16 d20, q10, q8\n\t"
1112 "vraddhn.u16 d21, q11, q9\n\t"
1113 "vrshr.u16 q9, q13, #8\n\t"
1114 "vrshr.u16 q8, q12, #8\n\t"
1115 "vraddhn.u16 d23, q13, q9\n\t"
1116 "vraddhn.u16 d22, q12, q8\n\t"
/* dst (d4-d7) * inverted result alpha (d30 = ~d23). */
1118 "vmvn.8 d30, d23\n\t"
1119 "vmull.u8 q12, d30, d4\n\t"
1120 "vmull.u8 q13, d30, d5\n\t"
1121 "vmull.u8 q14, d30, d6\n\t"
1122 "vmull.u8 q15, d30, d7\n\t"
1124 "vrshr.u16 q8, q12, #8\n\t"
1125 "vrshr.u16 q9, q13, #8\n\t"
1126 "vraddhn.u16 d4, q12, q8\n\t"
1127 "vrshr.u16 q8, q14, #8\n\t"
1128 "vraddhn.u16 d5, q13, q9\n\t"
1129 "vrshr.u16 q9, q15, #8\n\t"
1130 "vraddhn.u16 d6, q14, q8\n\t"
1131 "vraddhn.u16 d7, q15, q9\n\t"
1132 /* result in d4-d7 */
1134 "vqadd.u8 d20, d4, d20\n\t"
1135 "vqadd.u8 d21, d5, d21\n\t"
1136 "vqadd.u8 d22, d6, d22\n\t"
1137 "vqadd.u8 d23, d7, d23\n\t"
1142 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
1144 : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst)
1146 : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
1147 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
/* Narrow path: 2 pixels (vtbl spreads 2 mask bytes), then 1 pixel. */
1160 dst_line += dst_stride;
1162 mask_line += mask_stride;
1167 uint8x8_t dval, temp, res;
1170 vreinterpret_u8_u16 (vld1_dup_u16 ((void *)mask)), mask_selector);
1171 dval = vld1_u8 ((void *)dst);
1173 temp = neon2mul (sval2, alpha);
1175 temp, neon2mul (dval, vtbl1_u8 (vmvn_u8 (temp), alpha_selector)));
1177 vst1_u8 ((void *)dst, res);
1186 uint8x8_t dval, temp, res;
1188 alpha = vtbl1_u8 (vld1_dup_u8 ((void *)mask), mask_selector);
1189 dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)dst));
1191 temp = neon2mul (sval2, alpha);
1193 temp, neon2mul (dval, vtbl1_u8 (vmvn_u8 (temp), alpha_selector)));
1195 vst1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (res), 0);
/* neon_composite_add_n_8_8: ADD of a solid source, modulated by an a8
 * mask, onto an a8 destination: dst = sat(dst + mask * src.alpha / 255).
 * Only the solid colour's alpha byte (src >> 24) is used.  Wide scanlines
 * use the overlapping 8-pixel method (lagging store via keep_dst); narrow
 * scanlines gather/scatter 4/2/1 pixels via lane loads/stores.
 * NOTE(review): interior lines (braces, remaining parameters, loop
 * headers, width tests) are missing from this view. */
1202 neon_composite_add_n_8_8 (pixman_implementation_t * impl,
1204 pixman_image_t * src_image,
1205 pixman_image_t * mask_image,
1206 pixman_image_t * dst_image,
1216 uint8_t *dst_line, *dst;
1217 uint8_t *mask_line, *mask;
1218 int dst_stride, mask_stride;
1223 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
1224 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
/* Broadcast the solid colour's alpha byte across a D register. */
1225 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
1226 sa = vdup_n_u8 ((src) >> 24);
1230 /* Use overlapping 8-pixel method, modified to avoid rewritten dest being reused */
1234 dst_line += dst_stride;
1236 mask_line += mask_stride;
1239 uint8x8_t mval, dval, res;
/* Prime with the first 8 mask/dest bytes. */
1242 mval = vld1_u8 ((void *)mask);
1243 dval = vld1_u8 ((void *)dst);
1246 res = vqadd_u8 (neon2mul (mval, sa), dval);
/* Steady state with lagging store through keep_dst. */
1254 mval = vld1_u8 ((void *)mask);
1255 dval = vld1_u8 ((void *)dst);
1256 vst1_u8 ((void *)keep_dst, res);
1259 res = vqadd_u8 (neon2mul (mval, sa), dval);
1265 vst1_u8 ((void *)keep_dst, res);
1270 /* Use 4/2/1 load/store method to handle 1-7 pixels */
1274 dst_line += dst_stride;
1276 mask_line += mask_stride;
1279 uint8x8_t mval = sa, dval = sa, res;
1280 uint8_t *dst4 = 0, *dst2 = 0;
1284 mval = vreinterpret_u8_u32 (
1285 vld1_lane_u32 ((void *)mask, vreinterpret_u32_u8 (mval), 1));
1286 dval = vreinterpret_u8_u32 (
1287 vld1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (dval), 1));
1296 mval = vreinterpret_u8_u16 (
1297 vld1_lane_u16 ((void *)mask, vreinterpret_u16_u8 (mval), 1));
1298 dval = vreinterpret_u8_u16 (
1299 vld1_lane_u16 ((void *)dst, vreinterpret_u16_u8 (dval), 1));
1307 mval = vld1_lane_u8 (mask, mval, 1);
1308 dval = vld1_lane_u8 (dst, dval, 1);
/* One vector op covers all gathered lanes. */
1311 res = vqadd_u8 (neon2mul (mval, sa), dval);
1314 vst1_lane_u8 (dst, res, 1);
1316 vst1_lane_u16 ((void *)dst2, vreinterpret_u16_u8 (res), 1);
1318 vst1_lane_u32 ((void *)dst4, vreinterpret_u32_u8 (res), 1);
/* Pure-asm fast path: only built when GCC inline asm is usable. */
1323 #ifdef USE_GCC_INLINE_ASM
/* neon_composite_src_16_16: SRC operator between two r5g6b5 images — a
 * straight 16bpp scanline copy.  Uses pld preloads of the next scanline
 * and wide vld1/vst1 multi-register transfers, then a binary-decomposed
 * tail (32/16/8/4/2/1 halfwords) selected by tst on the count bits.
 * NOTE(review): interior lines (braces, remaining parameters, the outer
 * while loop, some asm labels) are missing from this view. */
1326 neon_composite_src_16_16 (pixman_implementation_t * impl,
1328 pixman_image_t * src_image,
1329 pixman_image_t * mask_image,
1330 pixman_image_t * dst_image,
1340 uint16_t *dst_line, *src_line;
1341 uint32_t dst_stride, src_stride;
1343 if (!height || !width)
1346 /* We simply copy 16-bit-aligned pixels from one place to another. */
1347 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
1348 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1350 /* Preload the first input scanline */
1352 uint16_t *src_ptr = src_line;
1353 uint32_t count = width;
/* Warm the cache 64 bytes (32 px) at a time before the copy loop. */
1357 " subs %[count], %[count], #32 \n"
1359 " add %[src], %[src], #64 \n"
1362 /* Clobbered input registers marked as input/outputs */
1363 : [src] "+r" (src_ptr), [count] "+r" (count)
1364 : /* no unclobbered inputs */
1371 uint16_t *dst_ptr = dst_line;
1372 uint16_t *src_ptr = src_line;
1373 uint32_t count = width;
1376 /* Uses multi-register access and preloading to maximise bandwidth.
1377 * Each pixel is one halfword, so a quadword contains 8px.
1378 * Preload frequency assumed a 64-byte cacheline.
/* Main copy: 64-pixel (8-quadword) chunks while count >= 64. */
1381 " cmp %[count], #64 \n"
1382 " blt 1f @ skip oversized fragments \n"
1383 "0: @ start with eight quadwords at a time \n"
1384 /* preload from next scanline */
1385 " pld [%[src], %[src_stride], LSL #1] \n"
1386 " sub %[count], %[count], #64 \n"
1387 " vld1.16 {d16, d17, d18, d19}, [%[src]]! \n"
1388 " vld1.16 {d20, d21, d22, d23}, [%[src]]! \n"
1389 /* preload from next scanline */
1390 " pld [%[src], %[src_stride], LSL #1] \n"
1391 " vld1.16 {d24, d25, d26, d27}, [%[src]]! \n"
1392 " vld1.16 {d28, d29, d30, d31}, [%[src]]! \n"
1393 " cmp %[count], #64 \n"
1394 " vst1.16 {d16, d17, d18, d19}, [%[dst]]! \n"
1395 " vst1.16 {d20, d21, d22, d23}, [%[dst]]! \n"
1396 " vst1.16 {d24, d25, d26, d27}, [%[dst]]! \n"
1397 " vst1.16 {d28, d29, d30, d31}, [%[dst]]! \n"
1399 " cmp %[count], #0 \n"
1400 " beq 7f @ aligned fastpath \n"
/* Tail: peel off 32/16/8/4/2/1 remaining pixels by testing count bits. */
1401 "1: @ four quadwords \n"
1402 " tst %[count], #32 \n"
1403 " beq 2f @ skip oversized fragment \n"
1404 /* preload from next scanline */
1405 " pld [%[src], %[src_stride], LSL #1] \n"
1406 " vld1.16 {d16, d17, d18, d19}, [%[src]]! \n"
1407 " vld1.16 {d20, d21, d22, d23}, [%[src]]! \n"
1408 " vst1.16 {d16, d17, d18, d19}, [%[dst]]! \n"
1409 " vst1.16 {d20, d21, d22, d23}, [%[dst]]! \n"
1410 "2: @ two quadwords \n"
1411 " tst %[count], #16 \n"
1412 " beq 3f @ skip oversized fragment \n"
1413 /* preload from next scanline */
1414 " pld [%[src], %[src_stride], LSL #1] \n"
1415 " vld1.16 {d16, d17, d18, d19}, [%[src]]! \n"
1416 " vst1.16 {d16, d17, d18, d19}, [%[dst]]! \n"
1417 "3: @ one quadword \n"
1418 " tst %[count], #8 \n"
1419 " beq 4f @ skip oversized fragment \n"
1420 " vld1.16 {d16, d17}, [%[src]]! \n"
1421 " vst1.16 {d16, d17}, [%[dst]]! \n"
1422 "4: @ one doubleword \n"
1423 " tst %[count], #4 \n"
1424 " beq 5f @ skip oversized fragment \n"
1425 " vld1.16 {d16}, [%[src]]! \n"
1426 " vst1.16 {d16}, [%[dst]]! \n"
1428 " tst %[count], #2 \n"
1429 " beq 6f @ skip oversized fragment \n"
1430 " ldr %[tmp], [%[src]], #4 \n"
1431 " str %[tmp], [%[dst]], #4 \n"
1432 "6: @ one halfword \n"
1433 " tst %[count], #1 \n"
1434 " beq 7f @ skip oversized fragment \n"
1435 " ldrh %[tmp], [%[src]] \n"
1436 " strh %[tmp], [%[dst]] \n"
1439 /* Clobbered input registers marked as input/outputs */
1440 : [dst] "+r" (dst_ptr), [src] "+r" (src_ptr),
1441 [count] "+r" (count), [tmp] "+r" (tmp)
1443 /* Unclobbered input */
1444 : [src_stride] "r" (src_stride)
1446 /* Clobbered vector registers */
1447 : "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
1448 "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31", "cc", "memory"
1451 src_line += src_stride;
1452 dst_line += dst_stride;
1456 #endif /* USE_GCC_INLINE_ASM */
/* SRC-operator blit converting 32-bit x8r8g8b8/a8r8g8b8 source pixels to
 * 16-bit r5g6b5 (or the BGR-swapped equivalents), one scanline at a time.
 * Source alpha is ignored — the fast-path table only routes formats here
 * where treating the source as opaque is correct.
 * NOTE(review): this excerpt is elided — the full parameter list, the
 * early-return body and several loop/brace lines are not visible here.
 */
1459 neon_composite_src_24_16 (pixman_implementation_t * impl,
1461 pixman_image_t * src_image,
1462 pixman_image_t * mask_image,
1463 pixman_image_t * dst_image,
1475 uint32_t dst_stride, src_stride;
/* Nothing to do for an empty rectangle. */
1477 if (!width || !height)
1480 /* We simply copy pixels from one place to another,
1481 * assuming that the source's alpha is opaque.
1483 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1484 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1486 /* Preload the first input scanline */
1488 uint8_t *src_ptr = (uint8_t*) src_line;
/* One preload step per 64-byte chunk, i.e. per 16 source pixels. */
1489 uint32_t count = (width + 15) / 16;
1491 #ifdef USE_GCC_INLINE_ASM
1494 " subs %[count], %[count], #1 \n"
1496 " add %[src], %[src], #64 \n"
1499 /* Clobbered input registers marked as input/outputs */
1500 : [src] "+r" (src_ptr), [count] "+r" (count)
1501 : /* no unclobbered inputs */
/* Per-scanline working state for the conversion loops below. */
1516 uint16_t *dst_ptr = dst_line;
1517 uint32_t *src_ptr = src_line;
1518 uint32_t count = width;
/* 565 channel masks: 5 bits for red/blue, 6 bits for green. */
1519 const uint32_t rb_mask = 0x1F;
1520 const uint32_t g_mask = 0x3F;
1522 /* If you're going to complain about a goto, take a long hard look
1523 * at the massive blocks of assembler this skips over. ;-)
1528 #ifdef USE_GCC_INLINE_ASM
1530 /* This is not as aggressive as the RGB565-source case.
1531 * Generally the source is in cached RAM when the formats are
1532 * different, so we use preload.
1534 * We don't need to blend, so we are not reading from the
1535 * uncached framebuffer.
/* Main 16-pixels-per-iteration NEON loop: vld4 de-interleaves the byte
 * channels, vshll #8 widens each channel into the high byte of a 16-bit
 * lane, then vsri packs 565 by inserting green at #5 and blue at #11. */
1538 " cmp %[count], #16 \n"
1539 " blt 1f @ skip oversized fragments \n"
1540 "0: @ start with sixteen pixels at a time \n"
1541 " sub %[count], %[count], #16 \n"
1542 " pld [%[src], %[src_stride], lsl #2] @ preload from next scanline \n"
1543 " vld4.8 {d0, d1, d2, d3}, [%[src]]! @ d3 is alpha and ignored, d2-0 are rgb. \n"
1544 " vld4.8 {d4, d5, d6, d7}, [%[src]]! @ d7 is alpha and ignored, d6-4 are rgb. \n"
1545 " vshll.u8 q8, d2, #8 @ expand first red for repacking \n"
1546 " vshll.u8 q10, d1, #8 @ expand first green for repacking \n"
1547 " vshll.u8 q11, d0, #8 @ expand first blue for repacking \n"
1548 " vshll.u8 q9, d6, #8 @ expand second red for repacking \n"
1549 " vsri.u16 q8, q10, #5 @ insert first green after red \n"
1550 " vshll.u8 q10, d5, #8 @ expand second green for repacking \n"
1551 " vsri.u16 q8, q11, #11 @ insert first blue after green \n"
1552 " vshll.u8 q11, d4, #8 @ expand second blue for repacking \n"
1553 " vsri.u16 q9, q10, #5 @ insert second green after red \n"
1554 " vsri.u16 q9, q11, #11 @ insert second blue after green \n"
1555 " cmp %[count], #16 \n"
1556 " vst1.16 {d16, d17, d18, d19}, [%[dst]]! @ store 16 pixels \n"
/* One optional 8-pixel block for the remaining half chunk; same scheme. */
1558 "1: @ end of main loop \n"
1559 " cmp %[count], #8 @ can we still do an 8-pixel block? \n"
1561 " sub %[count], %[count], #8 \n"
1562 " pld [%[src], %[src_stride], lsl #2] @ preload from next scanline \n"
1563 " vld4.8 {d0, d1, d2, d3}, [%[src]]! @ d3 is alpha and ignored, d2-0 are rgb. \n"
1564 " vshll.u8 q8, d2, #8 @ expand first red for repacking \n"
1565 " vshll.u8 q10, d1, #8 @ expand first green for repacking \n"
1566 " vshll.u8 q11, d0, #8 @ expand first blue for repacking \n"
1567 " vsri.u16 q8, q10, #5 @ insert first green after red \n"
1568 " vsri.u16 q8, q11, #11 @ insert first blue after green \n"
1569 " vst1.16 {d16, d17}, [%[dst]]! @ store 8 pixels \n"
1572 /* Clobbered input and working registers marked as input/outputs */
1573 : [dst] "+r" (dst_ptr), [src] "+r" (src_ptr), [count] "+r" (count)
1575 /* Unclobbered input */
1576 : [src_stride] "r" (src_stride)
1578 /* Clobbered vector registers */
1580 /* NB: these are the quad aliases of the
1581 * double registers used in the asm
1583 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17",
1584 "d18", "d19", "d20", "d21", "d22", "d23", "cc", "memory"
1587 /* A copy of the above code, in intrinsics-form. */
1590 uint8x8x4_t pixel_set_a, pixel_set_b;
1591 uint16x8_t red_a, green_a, blue_a;
1592 uint16x8_t red_b, green_b, blue_b;
1593 uint16x8_t dest_pixels_a, dest_pixels_b;
/* Preload the corresponding pixels on the next scanline. */
1596 __pld (src_ptr + src_stride);
1597 pixel_set_a = vld4_u8 ((uint8_t*)(src_ptr));
1598 pixel_set_b = vld4_u8 ((uint8_t*)(src_ptr + 8));
/* Widen each 8-bit channel into the top byte of a 16-bit lane. */
1601 red_a = vshll_n_u8 (pixel_set_a.val[2], 8);
1602 green_a = vshll_n_u8 (pixel_set_a.val[1], 8);
1603 blue_a = vshll_n_u8 (pixel_set_a.val[0], 8);
1605 red_b = vshll_n_u8 (pixel_set_b.val[2], 8);
1606 green_b = vshll_n_u8 (pixel_set_b.val[1], 8);
1607 blue_b = vshll_n_u8 (pixel_set_b.val[0], 8);
/* Shift-right-insert packs 565: green lands at bit 5, blue at bit 0. */
1609 dest_pixels_a = vsriq_n_u16 (red_a, green_a, 5);
1610 dest_pixels_b = vsriq_n_u16 (red_b, green_b, 5);
1612 dest_pixels_a = vsriq_n_u16 (dest_pixels_a, blue_a, 11);
1613 dest_pixels_b = vsriq_n_u16 (dest_pixels_b, blue_b, 11);
1615 /* There doesn't seem to be an intrinsic for the
1616 * double-quadword variant
1618 vst1q_u16 (dst_ptr, dest_pixels_a);
1619 vst1q_u16 (dst_ptr + 8, dest_pixels_b);
/* Intrinsics variant of the 8-pixel half-chunk tail. */
1626 uint8x8x4_t pixel_set_a;
1627 uint16x8_t red_a, green_a, blue_a;
1628 uint16x8_t dest_pixels_a;
1630 __pld (src_ptr + src_stride);
1632 pixel_set_a = vld4_u8 ((uint8_t*)(src_ptr));
1635 red_a = vshll_n_u8 (pixel_set_a.val[2], 8);
1636 green_a = vshll_n_u8 (pixel_set_a.val[1], 8);
1637 blue_a = vshll_n_u8 (pixel_set_a.val[0], 8);
1639 dest_pixels_a = vsriq_n_u16 (red_a, green_a, 5);
1640 dest_pixels_a = vsriq_n_u16 (dest_pixels_a, blue_a, 11);
1642 vst1q_u16 (dst_ptr, dest_pixels_a);
1646 #endif /* USE_GCC_INLINE_ASM */
/* Scalar fallback for the remaining < 8 pixels. */
1650 __pld (src_ptr + src_stride);
/* Two pixels per iteration so the 16-bit results can be stored as one
 * 32-bit write (little-endian only; see note below). */
1654 uint32_t src_pixel_a = *src_ptr++;
1655 uint32_t src_pixel_b = *src_ptr++;
1657 /* ARM is really good at shift-then-ALU ops. */
1658 /* This should be a total of six shift-ANDs and five shift-ORs. */
1659 uint32_t dst_pixels_a;
1660 uint32_t dst_pixels_b;
1662 dst_pixels_a = ((src_pixel_a >> 3) & rb_mask);
1663 dst_pixels_a |= ((src_pixel_a >> 10) & g_mask) << 5;
1664 dst_pixels_a |= ((src_pixel_a >> 19) & rb_mask) << 11;
1666 dst_pixels_b = ((src_pixel_b >> 3) & rb_mask);
1667 dst_pixels_b |= ((src_pixel_b >> 10) & g_mask) << 5;
1668 dst_pixels_b |= ((src_pixel_b >> 19) & rb_mask) << 11;
1670 /* little-endian mode only */
1671 *((uint32_t*) dst_ptr) = dst_pixels_a | (dst_pixels_b << 16);
/* Final odd pixel, if any, stored as a plain halfword. */
1678 uint32_t src_pixel = *src_ptr++;
1680 /* ARM is really good at shift-then-ALU ops.
1681 * This block should end up as three shift-ANDs
1682 * and two shift-ORs.
1684 uint32_t tmp_blue = (src_pixel >> 3) & rb_mask;
1685 uint32_t tmp_green = (src_pixel >> 10) & g_mask;
1686 uint32_t tmp_red = (src_pixel >> 19) & rb_mask;
1687 uint16_t dst_pixel = (tmp_red << 11) | (tmp_green << 5) | tmp_blue;
1689 *dst_ptr++ = dst_pixel;
/* Advance both images to the next scanline (strides are in elements). */
1693 src_line += src_stride;
1694 dst_line += dst_stride;
/* Solid-color rectangle fill using NEON stores.
 * Presumably returns TRUE when the fill was handled here and FALSE for
 * unsupported bpp so the caller can fall back (see arm_neon_fill) — the
 * bpp switch and return statements are elided from this excerpt; confirm
 * against the full file.
 * NOTE(review): `_xor` is the fill color replicated to a full 32-bit
 * pattern per bpp; `width` is converted from pixels to bytes below.
 */
1698 static pixman_bool_t
1699 pixman_fill_neon (uint32_t *bits,
1708 uint32_t byte_stride, color;
1711 /* stride is always multiple of 32bit units in pixman */
1712 byte_stride = stride * sizeof(uint32_t);
/* 8 bpp: byte-address the pixel, replicate the color to all 4 bytes. */
1717 dst = ((char *) bits) + y * byte_stride + x;
1719 color = _xor << 24 | _xor << 16 | _xor << 8 | _xor;
/* 16 bpp: 2 bytes per pixel, replicate the color to both halfwords. */
1723 dst = ((char *) bits) + y * byte_stride + x * 2;
1725 color = _xor << 16 | _xor;
1726 width *= 2; /* width to bytes */
/* 32 bpp: 4 bytes per pixel, color already fills the word. */
1730 dst = ((char *) bits) + y * byte_stride + x * 4;
1732 width *= 4; /* width to bytes */
1739 #ifdef USE_GCC_INLINE_ASM
1742 /* We have a special case for such small widths that don't allow
1743 * us to use wide 128-bit stores anyway. We don't waste time
1744 * trying to align writes, since there are only very few of them anyway
/* Narrow-fill path: widths below 16 bytes, unaligned stores tolerated. */
1747 "cmp %[height], #0\n"/* Check if empty fill */
1749 "vdup.32 d0, %[color]\n"/* Fill the color to neon req */
1751 /* Check if we have a such width that can easily be handled by single
1752 * operation for each scanline. This significantly reduces the number
1753 * of test/branch instructions for each scanline
1755 "cmp %[width], #8\n"
1757 "cmp %[width], #4\n"
1759 "cmp %[width], #2\n"
1762 /* Loop starts here for each scanline */
/* General narrow case: decompose the byte width into 8/4/2/1-byte
 * stores by testing its bits, largest first. */
1764 "mov r4, %[dst]\n" /* Starting address of the current line */
1765 "tst %[width], #8\n"
1767 "vst1.8 {d0}, [r4]!\n"
1769 "tst %[width], #4\n"
1771 "str %[color], [r4], #4\n"
1773 "tst %[width], #2\n"
1775 "strh %[color], [r4], #2\n"
1777 "tst %[width], #1\n"
1779 "strb %[color], [r4], #1\n"
1782 "subs %[height], %[height], #1\n"
1783 "add %[dst], %[dst], %[byte_stride]\n"
1787 /* Special fillers for those widths that we can do with single operation */
1789 "subs %[height], %[height], #1\n"
1790 "vst1.8 {d0}, [%[dst]]\n"
1791 "add %[dst], %[dst], %[byte_stride]\n"
1796 "subs %[height], %[height], #1\n"
1797 "str %[color], [%[dst]]\n"
1798 "add %[dst], %[dst], %[byte_stride]\n"
1803 "subs %[height], %[height], #1\n"
1804 "strh %[color], [%[dst]]\n"
1805 "add %[dst], %[dst], %[byte_stride]\n"
1809 : [height] "+r" (height), [dst] "+r" (dst)
1810 : [color] "r" (color), [width] "r" (width),
1811 [byte_stride] "r" (byte_stride)
1812 : "memory", "cc", "d0", "r4");
/* Wide-fill path (>= 16 bytes/scanline): align the write cursor up to a
 * 128-bit boundary, blast aligned quadword stores, then mop up the tail. */
1817 "cmp %[height], #0\n"/* Check if empty fill */
1819 "vdup.32 q0, %[color]\n"/* Fill the color to neon req */
1821 /* Loop starts here for each scanline */
1823 "mov r4, %[dst]\n"/* Starting address of the current line */
1824 "mov r5, %[width]\n"/* We're going to write this many bytes */
1825 "ands r6, r4, #15\n"/* Are we at the 128-bit aligned address? */
1826 "beq 2f\n"/* Jump to the best case */
1828 /* We're not 128-bit aligned: However, we know that we can get to the
1829 next aligned location, since the fill is at least 16 bytes wide */
1830 "rsb r6, r6, #16\n" /* We would need to go forward this much */
1831 "sub r5, r5, r6\n"/* Update bytes left */
1834 "vst1.8 {d0[0]}, [r4]!\n"/* Store byte, now we are word aligned */
1838 "vst1.16 {d0[0]}, [r4, :16]!\n"/* Store half word, now we are 16-bit aligned */
1842 "vst1.32 {d0[0]}, [r4, :32]!\n"/* Store word, now we're 32-bit aligned */
1846 "vst1.64 {d0}, [r4, :64]!\n"/* Store qword now we're 64-bit aligned */
1848 /* The good case: We're 128-bit aligned for this scanline */
1850 "and r6, r5, #15\n"/* Number of tailing bytes */
1851 "cmp r5, r6\n"/* Do we have at least one qword to write? */
1852 "beq 6f\n"/* No, we just write the tail */
1853 "lsr r5, r5, #4\n"/* This many full qwords to write */
1855 /* The main block: Do 128-bit aligned writes */
1858 "vst1.64 {d0, d1}, [r4, :128]!\n"
1861 /* Handle the tailing bytes: Do 64, 32, 16 and 8-bit aligned writes as needed.
1862 We know that we're currently at 128-bit aligned address, so we can just
1863 pick the biggest operations that the remaining write width allows */
1869 "vst1.64 {d0}, [r4, :64]!\n"
1873 "vst1.32 {d0[0]}, [r4, :32]!\n"
1877 "vst1.16 {d0[0]}, [r4, :16]!\n"
1881 "vst1.8 {d0[0]}, [r4]!\n"
1884 /* Handle the next scanline */
1885 "subs %[height], %[height], #1\n"
1886 "add %[dst], %[dst], %[byte_stride]\n"
1889 : [height] "+r" (height), [dst] "+r" (dst)
1890 : [color] "r" (color), [width] "r" (width),
1891 [byte_stride] "r" (byte_stride)
1892 : "memory", "cc", "d0", "d1", "r4", "r5", "r6");
1898 /* TODO: intrinsic version for armcc */
/* Dispatch table mapping (operator, source, mask, destination) format
 * combinations to the NEON composite routines above.  Consumed by
 * _pixman_run_fast_path () in arm_neon_composite ().
 * The 16->16 copy entries depend on the GCC inline-asm implementation
 * and are compiled in only when USE_GCC_INLINE_ASM is defined.
 * NOTE(review): the closing sentinel entry and the array terminator are
 * elided from this excerpt.
 */
1904 static const pixman_fast_path_t arm_neon_fast_path_array[] =
1906 { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, neon_composite_add_n_8_8, 0 },
1907 { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, neon_composite_add_8000_8000, 0 },
1908 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, neon_composite_over_n_8_0565, 0 },
1909 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, neon_composite_over_n_8_0565, 0 },
1910 { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_src_24_16, 0 },
1911 { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_src_24_16, 0 },
1912 { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_src_24_16, 0 },
1913 { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_src_24_16, 0 },
1914 #ifdef USE_GCC_INLINE_ASM
1915 { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_src_16_16, 0 },
1916 { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_src_16_16, 0 },
1918 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, neon_composite_over_8888_8888, 0 },
1919 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, neon_composite_over_8888_8888, 0 },
1920 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, neon_composite_over_8888_8888, 0 },
1921 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, neon_composite_over_8888_8888, 0 },
1922 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK },
1923 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK },
1924 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, neon_composite_over_n_8_8888, 0 },
1925 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, neon_composite_over_n_8_8888, 0 },
1926 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, neon_composite_over_n_8_8888, 0 },
1927 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, neon_composite_over_n_8_8888, 0 },
/* Public alias through which the rest of pixman refers to this table. */
1931 const pixman_fast_path_t *const arm_neon_fast_paths = arm_neon_fast_path_array;
/* Composite entry point for the NEON implementation: try the fast-path
 * table first; if no entry matches, hand the request to the delegate
 * implementation (created in _pixman_implementation_create_arm_neon).
 * NOTE(review): the early return after the fast-path hit and the closing
 * brace are elided from this excerpt.
 */
1934 arm_neon_composite (pixman_implementation_t *imp,
1936 pixman_image_t * src,
1937 pixman_image_t * mask,
1938 pixman_image_t * dest,
1948 if (_pixman_run_fast_path (arm_neon_fast_paths, imp,
1949 op, src, mask, dest,
/* No fast path matched: defer to the next implementation in the chain. */
1958 _pixman_implementation_composite (imp->delegate, op,
/* Fill entry point: attempt the NEON fill; if pixman_fill_neon () declines
 * (returns FALSE), fall back to the delegate implementation's fill.
 * NOTE(review): the return statement taken on NEON success is elided from
 * this excerpt.
 */
1966 static pixman_bool_t
1967 arm_neon_fill (pixman_implementation_t *imp,
1977 if (pixman_fill_neon (bits, stride, bpp, x, y, width, height, xor))
1980 return _pixman_implementation_fill (
1981 imp->delegate, bits, stride, bpp, x, y, width, height, xor);
1984 pixman_implementation_t *
1985 _pixman_implementation_create_arm_neon (void)
1987 pixman_implementation_t *general = _pixman_implementation_create_fast_path ();
1988 pixman_implementation_t *imp = _pixman_implementation_create (general);
1990 imp->composite = arm_neon_composite;
1991 imp->fill = arm_neon_fill;