2 * Copyright © 2009 ARM Ltd, Movial Creative Technologies Oy
4 * Permission to use, copy, modify, distribute, and sell this software and its
5 * documentation for any purpose is hereby granted without fee, provided that
6 * the above copyright notice appear in all copies and that both that
7 * copyright notice and this permission notice appear in supporting
8 * documentation, and that the name of ARM Ltd not be used in
9 * advertising or publicity pertaining to distribution of the software without
10 * specific, written prior permission. ARM Ltd makes no
11 * representations about the suitability of this software for any purpose. It
12 * is provided "as is" without express or implied warranty.
14 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
15 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
16 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
17 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
18 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
19 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
20 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
23 * Author: Ian Rickards (ian.rickards@arm.com)
24 * Author: Jonathan Morton (jonathan.morton@movial.com)
25 * Author: Markku Vire (markku.vire@movial.com)
35 #include "pixman-private.h"
37 // Deal with an intrinsic that is defined differently in GCC
38 #if !defined(__ARMCC_VERSION) && !defined(__pld)
39 #define __pld(_x) __builtin_prefetch(_x)
42 static force_inline uint8x8x4_t unpack0565(uint16x8_t rgb)
47 res.val[3] = vdup_n_u8(0);
48 gb = vshrq_n_u16(rgb, 5);
49 b = vshrq_n_u16(rgb, 5+6);
50 res.val[0] = vmovn_u16(rgb); // get low 5 bits (blue)
51 res.val[1] = vmovn_u16(gb); // get mid 6 bits (green)
52 res.val[2] = vmovn_u16(b); // get top 5 bits (red)
54 res.val[0] = vshl_n_u8(res.val[0], 3); // shift to top
55 res.val[1] = vshl_n_u8(res.val[1], 2); // shift to top
56 res.val[2] = vshl_n_u8(res.val[2], 3); // shift to top
58 res.val[0] = vsri_n_u8(res.val[0], res.val[0], 5);
59 res.val[1] = vsri_n_u8(res.val[1], res.val[1], 6);
60 res.val[2] = vsri_n_u8(res.val[2], res.val[2], 5);
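/* For reference, a scalar sketch of what unpack0565 computes per pixel
   (illustrative only, not part of the original code): each 5- or 6-bit field
   is shifted to the top of a byte and its high bits are replicated into the
   low bits, so 0x1F expands to 0xFF rather than 0xF8.

       static inline uint8_t expand5 (uint16_t v) { v &= 0x1f; return (uint8_t)((v << 3) | (v >> 2)); }
       static inline uint8_t expand6 (uint16_t v) { v &= 0x3f; return (uint8_t)((v << 2) | (v >> 4)); }
       // blue  = expand5 (rgb565)
       // green = expand6 (rgb565 >> 5)
       // red   = expand5 (rgb565 >> 11)

   The vsri instructions above perform the same bit replication on eight
   pixels at once. */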
65 static force_inline uint16x8_t pack0565(uint8x8x4_t s)
67 uint16x8_t rgb, val_g, val_r;
69 rgb = vshll_n_u8(s.val[2],8);
70 val_g = vshll_n_u8(s.val[1],8);
71 val_r = vshll_n_u8(s.val[0],8);
72 rgb = vsriq_n_u16(rgb, val_g, 5);
73 rgb = vsriq_n_u16(rgb, val_r, 5+6);
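/* Packing is the mirror image: vsri inserts the top bits of green and blue
   beneath red within each 16-bit lane. A per-pixel scalar sketch of the same
   computation (illustrative only):

       static inline uint16_t pack565 (uint8_t r, uint8_t g, uint8_t b)
       {
           return (uint16_t)(((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
       }
*/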
78 static force_inline uint8x8_t neon2mul(uint8x8_t x, uint8x8_t alpha)
83 tmp = vmull_u8(x,alpha);
84 tmp2 = vrshrq_n_u16(tmp,8);
85 res = vraddhn_u16(tmp,tmp2);
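/* The three instructions above implement pixman's exact rounding division by
   255, i.e. x*alpha/255 rounded to the nearest integer. Illustrative scalar
   equivalent (not part of the original code):

       static inline uint8_t mul_div_255 (uint8_t x, uint8_t a)
       {
           uint16_t t = (uint16_t)(x * a);
           return (uint8_t)((t + 128 + ((t + 128) >> 8)) >> 8);
       }

   vmull forms the 16-bit product, vrshr computes (t + 128) >> 8, and vraddhn
   adds the two with rounding and keeps the high byte. */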
90 static force_inline uint8x8x4_t neon8mul(uint8x8x4_t x, uint8x8_t alpha)
94 uint16x8_t qtmp1,qtmp2;
96 tmp.val[0] = vmull_u8(x.val[0],alpha);
97 tmp.val[1] = vmull_u8(x.val[1],alpha);
98 tmp.val[2] = vmull_u8(x.val[2],alpha);
99 tmp.val[3] = vmull_u8(x.val[3],alpha);
101 qtmp1 = vrshrq_n_u16(tmp.val[0],8);
102 qtmp2 = vrshrq_n_u16(tmp.val[1],8);
103 res.val[0] = vraddhn_u16(tmp.val[0],qtmp1);
104 qtmp1 = vrshrq_n_u16(tmp.val[2],8);
105 res.val[1] = vraddhn_u16(tmp.val[1],qtmp2);
106 qtmp2 = vrshrq_n_u16(tmp.val[3],8);
107 res.val[2] = vraddhn_u16(tmp.val[2],qtmp1);
108 res.val[3] = vraddhn_u16(tmp.val[3],qtmp2);
113 static force_inline uint8x8x4_t neon8qadd(uint8x8x4_t x, uint8x8x4_t y)
117 res.val[0] = vqadd_u8(x.val[0],y.val[0]);
118 res.val[1] = vqadd_u8(x.val[1],y.val[1]);
119 res.val[2] = vqadd_u8(x.val[2],y.val[2]);
120 res.val[3] = vqadd_u8(x.val[3],y.val[3]);
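/* neon8mul and neon8qadd are the building blocks of the OVER operator on
   eight premultiplied pixels: dest = src + dest * (255 - src.alpha), applied
   per channel with saturation. Per-channel scalar sketch, using the
   hypothetical mul_div_255 helper sketched above (illustrative only):

       uint8_t over_channel (uint8_t s, uint8_t d, uint8_t s_alpha)
       {
           uint16_t v = (uint16_t)(s + mul_div_255 (d, 255 - s_alpha));
           return v > 255 ? 255 : (uint8_t)v;
       }
*/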
127 neon_CompositeAdd_8000_8000 (
128 pixman_implementation_t * impl,
130 pixman_image_t * src_image,
131 pixman_image_t * mask_image,
132 pixman_image_t * dst_image,
142 uint8_t *dstLine, *dst;
143 uint8_t *srcLine, *src;
144 int dstStride, srcStride;
147 fbComposeGetStart (src_image, src_x, src_y, uint8_t, srcStride, srcLine, 1);
148 fbComposeGetStart (dst_image, dest_x, dest_y, uint8_t, dstStride, dstLine, 1);
152 // Use overlapping 8-pixel method
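// Sketch of the overlapping 8-pixel method: the first (w & 7) pixels are
// handled as one full (possibly overlapping) 8-pixel group, the pointers are
// advanced by that remainder, and the rest of the scanline is processed in
// whole groups of 8. Each group's store is delayed by one iteration (via
// keep_dst), so the following group always loads its destination pixels
// before the overlapping store happens; the overlapping pixels are simply
// computed twice from identical inputs. Roughly:
//
//     compute 8 pixels at src/dst;              // result kept, not yet stored
//     src += w & 7;  dst += w & 7;  w -= w & 7;
//     while (w > 0) { load next 8; store previous result; compute; w -= 8; }
//     store the last result;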
156 dstLine += dstStride;
158 srcLine += srcStride;
163 #ifndef USE_GCC_INLINE_ASM
164 uint8x8_t sval,dval,temp;
166 sval = vld1_u8((void*)src);
167 dval = vld1_u8((void*)dst);
170 temp = vqadd_u8(dval,sval);
178 sval = vld1_u8((void*)src);
179 dval = vld1_u8((void*)dst);
181 vst1_u8((void*)keep_dst,temp);
184 temp = vqadd_u8(dval,sval);
190 vst1_u8((void*)keep_dst,temp);
193 // avoid using d8-d15 (q4-q7) aapcs callee-save registers
194 "vld1.8 {d0}, [%[src]]\n\t"
195 "vld1.8 {d4}, [%[dst]]\n\t"
196 "mov %[keep_dst], %[dst]\n\t"
198 "and ip, %[w], #7\n\t"
199 "add %[src], %[src], ip\n\t"
200 "add %[dst], %[dst], ip\n\t"
201 "subs %[w], %[w], ip\n\t"
205 "vld1.8 {d0}, [%[src]]!\n\t"
206 "vld1.8 {d4}, [%[dst]]!\n\t"
207 "vst1.8 {d20}, [%[keep_dst]]\n\t"
208 "sub %[keep_dst], %[dst], #8\n\t"
209 "subs %[w], %[w], #8\n\t"
211 "vqadd.u8 d20, d0, d4\n\t"
216 "vst1.8 {d20}, [%[keep_dst]]\n\t"
218 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
220 : "ip", "cc", "memory", "d0","d4",
228 const uint8_t nil = 0;
229 const uint8x8_t vnil = vld1_dup_u8(&nil);
234 dstLine += dstStride;
236 srcLine += srcStride;
238 uint8x8_t sval=vnil, dval=vnil;
239 uint8_t *dst4=0, *dst2=0;
243 sval = vreinterpret_u8_u32(vld1_lane_u32((void*)src,vreinterpret_u32_u8(sval),1));
244 dval = vreinterpret_u8_u32(vld1_lane_u32((void*)dst,vreinterpret_u32_u8(dval),1));
251 sval = vreinterpret_u8_u16(vld1_lane_u16((void*)src,vreinterpret_u16_u8(sval),1));
252 dval = vreinterpret_u8_u16(vld1_lane_u16((void*)dst,vreinterpret_u16_u8(dval),1));
259 sval = vld1_lane_u8(src,sval,1);
260 dval = vld1_lane_u8(dst,dval,1);
263 dval = vqadd_u8(dval,sval);
266 vst1_lane_u8(dst,dval,1);
268 vst1_lane_u16((void*)dst2,vreinterpret_u16_u8(dval),1);
270 vst1_lane_u32((void*)dst4,vreinterpret_u32_u8(dval),1);
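// The 4/2/1 method above decomposes a 1-7 pixel tail by the bits of the
// width: one 4-pixel, one 2-pixel and one 1-pixel transfer are issued as
// needed (e.g. width 7 = 4 + 2 + 1 issues all three), each using a
// single-lane load/store so nothing outside the scanline is touched.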
277 neon_composite_over_8888_8888 (
278 pixman_implementation_t * impl,
280 pixman_image_t * src_image,
281 pixman_image_t * mask_image,
282 pixman_image_t * dst_image,
292 uint32_t *dstLine, *dst;
293 uint32_t *srcLine, *src;
294 int dstStride, srcStride;
297 fbComposeGetStart (dst_image, dest_x, dest_y, uint32_t, dstStride, dstLine, 1);
298 fbComposeGetStart (src_image, src_x, src_y, uint32_t, srcStride, srcLine, 1);
302 // Use overlapping 8-pixel method
306 dstLine += dstStride;
308 srcLine += srcStride;
311 uint32_t *keep_dst=0;
313 #ifndef USE_GCC_INLINE_ASM
314 uint8x8x4_t sval,dval,temp;
316 sval = vld4_u8((void*)src);
317 dval = vld4_u8((void*)dst);
320 temp = neon8mul(dval,vmvn_u8(sval.val[3]));
321 temp = neon8qadd(sval,temp);
329 sval = vld4_u8((void*)src);
330 dval = vld4_u8((void*)dst);
332 vst4_u8((void*)keep_dst,temp);
335 temp = neon8mul(dval,vmvn_u8(sval.val[3]));
336 temp = neon8qadd(sval,temp);
342 vst4_u8((void*)keep_dst,temp);
345 // avoid using d8-d15 (q4-q7) aapcs callee-save registers
346 "vld4.8 {d0-d3}, [%[src]]\n\t"
347 "vld4.8 {d4-d7}, [%[dst]]\n\t"
348 "mov %[keep_dst], %[dst]\n\t"
350 "and ip, %[w], #7\n\t"
351 "add %[src], %[src], ip, LSL#2\n\t"
352 "add %[dst], %[dst], ip, LSL#2\n\t"
353 "subs %[w], %[w], ip\n\t"
357 "vld4.8 {d0-d3}, [%[src]]!\n\t"
358 "vld4.8 {d4-d7}, [%[dst]]!\n\t"
359 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
360 "sub %[keep_dst], %[dst], #8*4\n\t"
361 "subs %[w], %[w], #8\n\t"
364 "vmull.u8 q10, d31, d4\n\t"
365 "vmull.u8 q11, d31, d5\n\t"
366 "vmull.u8 q12, d31, d6\n\t"
367 "vmull.u8 q13, d31, d7\n\t"
368 "vrshr.u16 q8, q10, #8\n\t"
369 "vrshr.u16 q9, q11, #8\n\t"
370 "vraddhn.u16 d20, q10, q8\n\t"
371 "vraddhn.u16 d21, q11, q9\n\t"
372 "vrshr.u16 q8, q12, #8\n\t"
373 "vrshr.u16 q9, q13, #8\n\t"
374 "vraddhn.u16 d22, q12, q8\n\t"
375 "vraddhn.u16 d23, q13, q9\n\t"
377 "vqadd.u8 d20, d0, d20\n\t"
378 "vqadd.u8 d21, d1, d21\n\t"
379 "vqadd.u8 d22, d2, d22\n\t"
380 "vqadd.u8 d23, d3, d23\n\t"
385 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
387 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
389 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
390 "d16","d17","d18","d19","d20","d21","d22","d23"
397 uint8x8_t alpha_selector=vreinterpret_u8_u64(vcreate_u64(0x0707070703030303ULL));
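// alpha_selector is a VTBL index vector: bytes 0-3 select byte 3 (the alpha
// of the first pixel) and bytes 4-7 select byte 7 (the alpha of the second
// pixel), so vtbl1_u8(vmvn_u8(sval), alpha_selector) broadcasts each pixel's
// inverted alpha across its own four channels.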
403 dstLine += dstStride;
405 srcLine += srcStride;
412 /* two 32-bit pixels packed into D-reg; ad-hoc vectorization */
413 sval = vreinterpret_u8_u32(vld1_u32((void*)src));
414 dval = vreinterpret_u8_u32(vld1_u32((void*)dst));
415 dval = neon2mul(dval,vtbl1_u8(vmvn_u8(sval),alpha_selector));
416 vst1_u8((void*)dst,vqadd_u8(sval,dval));
427 /* single 32-bit pixel in lane 0 */
428 sval = vreinterpret_u8_u32(vld1_dup_u32((void*)src)); // only interested in lane 0
429 dval = vreinterpret_u8_u32(vld1_dup_u32((void*)dst)); // only interested in lane 0
430 dval = neon2mul(dval,vtbl1_u8(vmvn_u8(sval),alpha_selector));
431 vst1_lane_u32((void*)dst,vreinterpret_u32_u8(vqadd_u8(sval,dval)),0);
438 neon_composite_over_8888_n_8888 (
439 pixman_implementation_t * impl,
441 pixman_image_t * src_image,
442 pixman_image_t * mask_image,
443 pixman_image_t * dst_image,
453 uint32_t *dstLine, *dst;
454 uint32_t *srcLine, *src;
456 int dstStride, srcStride;
458 uint8x8_t mask_alpha;
460 fbComposeGetStart (dst_image, dest_x, dest_y, uint32_t, dstStride, dstLine, 1);
461 fbComposeGetStart (src_image, src_x, src_y, uint32_t, srcStride, srcLine, 1);
463 mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
464 mask_alpha = vdup_n_u8((mask) >> 24);
468 // Use overlapping 8-pixel method
472 dstLine += dstStride;
474 srcLine += srcStride;
477 uint32_t *keep_dst=0;
479 #ifndef USE_GCC_INLINE_ASM
480 uint8x8x4_t sval,dval,temp;
482 sval = vld4_u8((void*)src);
483 dval = vld4_u8((void*)dst);
486 sval = neon8mul(sval,mask_alpha);
487 temp = neon8mul(dval,vmvn_u8(sval.val[3]));
488 temp = neon8qadd(sval,temp);
496 sval = vld4_u8((void*)src);
497 dval = vld4_u8((void*)dst);
499 vst4_u8((void*)keep_dst,temp);
502 sval = neon8mul(sval,mask_alpha);
503 temp = neon8mul(dval,vmvn_u8(sval.val[3]));
504 temp = neon8qadd(sval,temp);
510 vst4_u8((void*)keep_dst,temp);
513 // avoid using d8-d15 (q4-q7) aapcs callee-save registers
514 "vdup.32 d30, %[mask]\n\t"
515 "vdup.8 d30, d30[3]\n\t"
517 "vld4.8 {d0-d3}, [%[src]]\n\t"
518 "vld4.8 {d4-d7}, [%[dst]]\n\t"
519 "mov %[keep_dst], %[dst]\n\t"
521 "and ip, %[w], #7\n\t"
522 "add %[src], %[src], ip, LSL#2\n\t"
523 "add %[dst], %[dst], ip, LSL#2\n\t"
524 "subs %[w], %[w], ip\n\t"
528 "vld4.8 {d0-d3}, [%[src]]!\n\t"
529 "vld4.8 {d4-d7}, [%[dst]]!\n\t"
530 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
531 "sub %[keep_dst], %[dst], #8*4\n\t"
532 "subs %[w], %[w], #8\n\t"
535 "vmull.u8 q10, d30, d0\n\t"
536 "vmull.u8 q11, d30, d1\n\t"
537 "vmull.u8 q12, d30, d2\n\t"
538 "vmull.u8 q13, d30, d3\n\t"
539 "vrshr.u16 q8, q10, #8\n\t"
540 "vrshr.u16 q9, q11, #8\n\t"
541 "vraddhn.u16 d0, q10, q8\n\t"
542 "vraddhn.u16 d1, q11, q9\n\t"
543 "vrshr.u16 q9, q13, #8\n\t"
544 "vrshr.u16 q8, q12, #8\n\t"
545 "vraddhn.u16 d3, q13, q9\n\t"
546 "vraddhn.u16 d2, q12, q8\n\t"
549 "vmull.u8 q10, d31, d4\n\t"
550 "vmull.u8 q11, d31, d5\n\t"
551 "vmull.u8 q12, d31, d6\n\t"
552 "vmull.u8 q13, d31, d7\n\t"
553 "vrshr.u16 q8, q10, #8\n\t"
554 "vrshr.u16 q9, q11, #8\n\t"
555 "vraddhn.u16 d20, q10, q8\n\t"
556 "vrshr.u16 q8, q12, #8\n\t"
557 "vraddhn.u16 d21, q11, q9\n\t"
558 "vrshr.u16 q9, q13, #8\n\t"
559 "vraddhn.u16 d22, q12, q8\n\t"
560 "vraddhn.u16 d23, q13, q9\n\t"
562 "vqadd.u8 d20, d0, d20\n\t"
563 "vqadd.u8 d21, d1, d21\n\t"
564 "vqadd.u8 d22, d2, d22\n\t"
565 "vqadd.u8 d23, d3, d23\n\t"
570 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
572 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
574 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
575 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27",
583 uint8x8_t alpha_selector=vreinterpret_u8_u64(vcreate_u64(0x0707070703030303ULL));
589 dstLine += dstStride;
591 srcLine += srcStride;
598 sval = vreinterpret_u8_u32(vld1_u32((void*)src));
599 dval = vreinterpret_u8_u32(vld1_u32((void*)dst));
601 /* sval * const alpha_mul */
602 sval = neon2mul(sval,mask_alpha);
604 /* dval * 255-(src alpha) */
605 dval = neon2mul(dval,vtbl1_u8(vmvn_u8(sval), alpha_selector));
607 vst1_u8((void*)dst,vqadd_u8(sval,dval));
618 sval = vreinterpret_u8_u32(vld1_dup_u32((void*)src));
619 dval = vreinterpret_u8_u32(vld1_dup_u32((void*)dst));
621 /* sval * const alpha_mul */
622 sval = neon2mul(sval,mask_alpha);
624 /* dval * 255-(src alpha) */
625 dval = neon2mul(dval,vtbl1_u8(vmvn_u8(sval), alpha_selector));
627 vst1_lane_u32((void*)dst,vreinterpret_u32_u8(vqadd_u8(sval,dval)),0);
635 neon_CompositeOver_n_8_8888 (
636 pixman_implementation_t * impl,
638 pixman_image_t * src_image,
639 pixman_image_t * mask_image,
640 pixman_image_t * dst_image,
651 uint32_t *dstLine, *dst;
652 uint8_t *maskLine, *mask;
653 int dstStride, maskStride;
657 uint8x8_t mask_selector=vreinterpret_u8_u64(vcreate_u64(0x0101010100000000ULL));
658 uint8x8_t alpha_selector=vreinterpret_u8_u64(vcreate_u64(0x0707070703030303ULL));
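// mask_selector spreads two a8 mask bytes across two pixels (lanes 0-3 take
// mask byte 0, lanes 4-7 take mask byte 1); alpha_selector is the same index
// vector used elsewhere to broadcast each pixel's own alpha (byte 3,
// respectively byte 7) across its four channels.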
660 src = _pixman_image_get_solid(src_image, dst_image->bits.format);
662 // bail out if fully transparent
667 sval2=vreinterpret_u8_u32(vdup_n_u32(src));
668 sval8.val[0]=vdup_lane_u8(sval2,0);
669 sval8.val[1]=vdup_lane_u8(sval2,1);
670 sval8.val[2]=vdup_lane_u8(sval2,2);
671 sval8.val[3]=vdup_lane_u8(sval2,3);
673 fbComposeGetStart (dst_image, dest_x, dest_y, uint32_t, dstStride, dstLine, 1);
674 fbComposeGetStart (mask_image, mask_x, mask_y, uint8_t, maskStride, maskLine, 1);
678 // Use overlapping 8-pixel method, modified to avoid rewritten dest being reused
681 uint32_t *keep_dst=0;
684 dstLine += dstStride;
686 maskLine += maskStride;
689 #ifndef USE_GCC_INLINE_ASM
691 uint8x8x4_t dval, temp;
693 alpha = vld1_u8((void*)mask);
694 dval = vld4_u8((void*)dst);
697 temp = neon8mul(sval8,alpha);
698 dval = neon8mul(dval,vmvn_u8(temp.val[3]));
699 temp = neon8qadd(temp,dval);
707 alpha = vld1_u8((void*)mask);
708 dval = vld4_u8((void*)dst);
710 vst4_u8((void*)keep_dst,temp);
713 temp = neon8mul(sval8,alpha);
714 dval = neon8mul(dval,vmvn_u8(temp.val[3]));
715 temp = neon8qadd(temp,dval);
721 vst4_u8((void*)keep_dst,temp);
724 "vdup.32 d0, %[src]\n\t"
725 "vdup.8 d1, d0[1]\n\t"
726 "vdup.8 d2, d0[2]\n\t"
727 "vdup.8 d3, d0[3]\n\t"
728 "vdup.8 d0, d0[0]\n\t"
730 "vld4.8 {d4-d7}, [%[dst]]\n\t"
731 "vld1.8 {d31}, [%[mask]]\n\t"
732 "mov %[keep_dst], %[dst]\n\t"
734 "and ip, %[w], #7\n\t"
735 "add %[mask], %[mask], ip\n\t"
736 "add %[dst], %[dst], ip, LSL#2\n\t"
737 "subs %[w], %[w], ip\n\t"
741 "vld4.8 {d4-d7}, [%[dst]]!\n\t"
742 "vld1.8 {d31}, [%[mask]]!\n\t"
743 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
744 "sub %[keep_dst], %[dst], #8*4\n\t"
745 "subs %[w], %[w], #8\n\t"
748 "vmull.u8 q10, d31, d0\n\t"
749 "vmull.u8 q11, d31, d1\n\t"
750 "vmull.u8 q12, d31, d2\n\t"
751 "vmull.u8 q13, d31, d3\n\t"
752 "vrshr.u16 q8, q10, #8\n\t"
753 "vrshr.u16 q9, q11, #8\n\t"
754 "vraddhn.u16 d20, q10, q8\n\t"
755 "vraddhn.u16 d21, q11, q9\n\t"
756 "vrshr.u16 q9, q13, #8\n\t"
757 "vrshr.u16 q8, q12, #8\n\t"
758 "vraddhn.u16 d23, q13, q9\n\t"
759 "vraddhn.u16 d22, q12, q8\n\t"
761 "vmvn.8 d30, d23\n\t"
762 "vmull.u8 q12, d30, d4\n\t"
763 "vmull.u8 q13, d30, d5\n\t"
764 "vmull.u8 q14, d30, d6\n\t"
765 "vmull.u8 q15, d30, d7\n\t"
767 "vrshr.u16 q8, q12, #8\n\t"
768 "vrshr.u16 q9, q13, #8\n\t"
769 "vraddhn.u16 d4, q12, q8\n\t"
770 "vrshr.u16 q8, q14, #8\n\t"
771 "vraddhn.u16 d5, q13, q9\n\t"
772 "vrshr.u16 q9, q15, #8\n\t"
773 "vraddhn.u16 d6, q14, q8\n\t"
774 "vraddhn.u16 d7, q15, q9\n\t"
777 "vqadd.u8 d20, d4, d20\n\t"
778 "vqadd.u8 d21, d5, d21\n\t"
779 "vqadd.u8 d22, d6, d22\n\t"
780 "vqadd.u8 d23, d7, d23\n\t"
785 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
787 : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst)
789 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
790 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
803 dstLine += dstStride;
805 maskLine += maskStride;
810 uint8x8_t dval, temp, res;
812 alpha = vtbl1_u8(vreinterpret_u8_u16(vld1_dup_u16((void*)mask)), mask_selector);
813 dval = vld1_u8((void*)dst);
815 temp = neon2mul(sval2,alpha);
816 res = vqadd_u8(temp,neon2mul(dval,vtbl1_u8(vmvn_u8(temp), alpha_selector)));
818 vst1_u8((void*)dst,res);
826 uint8x8_t dval, temp, res;
828 alpha = vtbl1_u8(vld1_dup_u8((void*)mask), mask_selector);
829 dval = vreinterpret_u8_u32(vld1_dup_u32((void*)dst));
831 temp = neon2mul(sval2,alpha);
832 res = vqadd_u8(temp,neon2mul(dval,vtbl1_u8(vmvn_u8(temp), alpha_selector)));
834 vst1_lane_u32((void*)dst,vreinterpret_u32_u8(res),0);
842 neon_CompositeAdd_8888_8_8 (
843 pixman_implementation_t * impl,
845 pixman_image_t * src_image,
846 pixman_image_t * mask_image,
847 pixman_image_t * dst_image,
857 uint8_t *dstLine, *dst;
858 uint8_t *maskLine, *mask;
859 int dstStride, maskStride;
864 fbComposeGetStart (dst_image, dest_x, dest_y, uint8_t, dstStride, dstLine, 1);
865 fbComposeGetStart (mask_image, mask_x, mask_y, uint8_t, maskStride, maskLine, 1);
866 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
867 sa = vdup_n_u8((src) >> 24);
871 // Use overlapping 8-pixel method, modified to avoid rewritten dest being reused
875 dstLine += dstStride;
877 maskLine += maskStride;
880 uint8x8_t mval, dval, res;
883 mval = vld1_u8((void *)mask);
884 dval = vld1_u8((void *)dst);
887 res = vqadd_u8(neon2mul(mval,sa),dval);
895 mval = vld1_u8((void *)mask);
896 dval = vld1_u8((void *)dst);
897 vst1_u8((void *)keep_dst, res);
900 res = vqadd_u8(neon2mul(mval,sa),dval);
906 vst1_u8((void *)keep_dst, res);
911 // Use 4/2/1 load/store method to handle 1-7 pixels
915 dstLine += dstStride;
917 maskLine += maskStride;
920 uint8x8_t mval=sa, dval=sa, res;
921 uint8_t *dst4=0, *dst2=0;
925 mval = vreinterpret_u8_u32(vld1_lane_u32((void *)mask, vreinterpret_u32_u8(mval), 1));
926 dval = vreinterpret_u8_u32(vld1_lane_u32((void *)dst, vreinterpret_u32_u8(dval), 1));
934 mval = vreinterpret_u8_u16(vld1_lane_u16((void *)mask, vreinterpret_u16_u8(mval), 1));
935 dval = vreinterpret_u8_u16(vld1_lane_u16((void *)dst, vreinterpret_u16_u8(dval), 1));
942 mval = vld1_lane_u8(mask, mval, 1);
943 dval = vld1_lane_u8(dst, dval, 1);
946 res = vqadd_u8(neon2mul(mval,sa),dval);
949 vst1_lane_u8(dst, res, 1);
951 vst1_lane_u16((void *)dst2, vreinterpret_u16_u8(res), 1);
953 vst1_lane_u32((void *)dst4, vreinterpret_u32_u8(res), 1);
958 #ifdef USE_GCC_INLINE_ASM
961 neon_CompositeSrc_16_16 (
962 pixman_implementation_t * impl,
964 pixman_image_t * src_image,
965 pixman_image_t * mask_image,
966 pixman_image_t * dst_image,
976 uint16_t *dstLine, *srcLine;
977 uint32_t dstStride, srcStride;
979 if(!height || !width)
982 /* We simply copy 16-bit-aligned pixels from one place to another. */
983 fbComposeGetStart (src_image, src_x, src_y, uint16_t, srcStride, srcLine, 1);
984 fbComposeGetStart (dst_image, dest_x, dest_y, uint16_t, dstStride, dstLine, 1);
986 /* Preload the first input scanline */
988 uint16_t *srcPtr = srcLine;
989 uint32_t count = width;
993 " subs %[count], %[count], #32 \n"
995 " add %[src], %[src], #64 \n"
998 // Clobbered input registers marked as input/outputs
999 : [src] "+r" (srcPtr), [count] "+r" (count)
1000 : // no unclobbered inputs
1006 uint16_t *dstPtr = dstLine;
1007 uint16_t *srcPtr = srcLine;
1008 uint32_t count = width;
1011 // Uses multi-register access and preloading to maximise bandwidth.
1012 // Each pixel is one halfword, so a quadword contains 8px.
1013 // Preload frequency assumes a 64-byte cache line.
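// The copy peels the width bit by bit: blocks of 64 pixels in the main loop,
// then optional 32-, 16-, 8-, 4-, 2- and 1-pixel fragments selected by the
// corresponding bits of the remaining count, so any width is handled without
// a per-pixel loop.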
1015 " cmp %[count], #64 \n"
1016 " blt 1f @ skip oversized fragments \n"
1017 "0: @ start with eight quadwords at a time \n"
1018 " pld [%[src], %[srcStride], LSL #1] \n" // preload from next scanline
1019 " sub %[count], %[count], #64 \n"
1020 " vld1.16 {d16,d17,d18,d19}, [%[src]]! \n"
1021 " vld1.16 {d20,d21,d22,d23}, [%[src]]! \n"
1022 " pld [%[src], %[srcStride], LSL #1] \n" // preload from next scanline
1023 " vld1.16 {d24,d25,d26,d27}, [%[src]]! \n"
1024 " vld1.16 {d28,d29,d30,d31}, [%[src]]! \n"
1025 " cmp %[count], #64 \n"
1026 " vst1.16 {d16,d17,d18,d19}, [%[dst]]! \n"
1027 " vst1.16 {d20,d21,d22,d23}, [%[dst]]! \n"
1028 " vst1.16 {d24,d25,d26,d27}, [%[dst]]! \n"
1029 " vst1.16 {d28,d29,d30,d31}, [%[dst]]! \n"
1031 " cmp %[count], #0 \n"
1032 " beq 7f @ aligned fastpath \n"
1033 "1: @ four quadwords \n"
1034 " tst %[count], #32 \n"
1035 " beq 2f @ skip oversized fragment \n"
1036 " pld [%[src], %[srcStride], LSL #1] \n" // preload from next scanline
1037 " vld1.16 {d16,d17,d18,d19}, [%[src]]! \n"
1038 " vld1.16 {d20,d21,d22,d23}, [%[src]]! \n"
1039 " vst1.16 {d16,d17,d18,d19}, [%[dst]]! \n"
1040 " vst1.16 {d20,d21,d22,d23}, [%[dst]]! \n"
1041 "2: @ two quadwords \n"
1042 " tst %[count], #16 \n"
1043 " beq 3f @ skip oversized fragment \n"
1044 " pld [%[src], %[srcStride], LSL #1] \n" // preload from next scanline
1045 " vld1.16 {d16,d17,d18,d19}, [%[src]]! \n"
1046 " vst1.16 {d16,d17,d18,d19}, [%[dst]]! \n"
1047 "3: @ one quadword \n"
1048 " tst %[count], #8 \n"
1049 " beq 4f @ skip oversized fragment \n"
1050 " vld1.16 {d16,d17}, [%[src]]! \n"
1051 " vst1.16 {d16,d17}, [%[dst]]! \n"
1052 "4: @ one doubleword \n"
1053 " tst %[count], #4 \n"
1054 " beq 5f @ skip oversized fragment \n"
1055 " vld1.16 {d16}, [%[src]]! \n"
1056 " vst1.16 {d16}, [%[dst]]! \n"
1058 " tst %[count], #2 \n"
1059 " beq 6f @ skip oversized fragment \n"
1060 " ldr %[tmp], [%[src]], #4 \n"
1061 " str %[tmp], [%[dst]], #4 \n"
1062 "6: @ one halfword \n"
1063 " tst %[count], #1 \n"
1064 " beq 7f @ skip oversized fragment \n"
1065 " ldrh %[tmp], [%[src]] \n"
1066 " strh %[tmp], [%[dst]] \n"
1069 // Clobbered input registers marked as input/outputs
1070 : [dst] "+r" (dstPtr), [src] "+r" (srcPtr), [count] "+r" (count), [tmp] "+r" (tmp)
1072 // Unclobbered input
1073 : [srcStride] "r" (srcStride)
1075 // Clobbered vector registers
1076 // NB: these are the quad aliases of the double registers used in the asm
1077 : "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory"
1080 srcLine += srcStride;
1081 dstLine += dstStride;
1085 #endif /* USE_GCC_INLINE_ASM */
1088 neon_CompositeSrc_24_16 (
1089 pixman_implementation_t * impl,
1091 pixman_image_t * src_image,
1092 pixman_image_t * mask_image,
1093 pixman_image_t * dst_image,
1105 uint32_t dstStride, srcStride;
1107 if(!width || !height)
1110 /* We simply copy pixels from one place to another, assuming that the source's alpha is opaque. */
1111 fbComposeGetStart (src_image, src_x, src_y, uint32_t, srcStride, srcLine, 1);
1112 fbComposeGetStart (dst_image, dest_x, dest_y, uint16_t, dstStride, dstLine, 1);
1114 /* Preload the first input scanline */
1116 uint8_t *srcPtr = (uint8_t*) srcLine;
1117 uint32_t count = (width + 15) / 16;
1119 #ifdef USE_GCC_INLINE_ASM
1122 " subs %[count], %[count], #1 \n"
1124 " add %[src], %[src], #64 \n"
1127 // Clobbered input registers marked as input/outputs
1128 : [src] "+r" (srcPtr), [count] "+r" (count)
1129 : // no unclobbered inputs
1141 uint16_t *dstPtr = dstLine;
1142 uint32_t *srcPtr = srcLine;
1143 uint32_t count = width;
1144 const uint32_t RBmask = 0x1F;
1145 const uint32_t Gmask = 0x3F;
1147 // If you're going to complain about a goto, take a long hard look
1148 // at the massive blocks of assembler this skips over. ;-)
1152 #ifdef USE_GCC_INLINE_ASM
1154 // This is not as aggressive as the RGB565-source case.
1155 // Generally the source is in cached RAM when the formats are different, so we use preload.
1156 // We don't need to blend, so we are not reading from the uncached framebuffer.
1158 " cmp %[count], #16 \n"
1159 " blt 1f @ skip oversized fragments \n"
1160 "0: @ start with sixteen pixels at a time \n"
1161 " sub %[count], %[count], #16 \n"
1162 " pld [%[src], %[srcStride], lsl #2] @ preload from next scanline \n"
1163 " vld4.8 {d0,d1,d2,d3}, [%[src]]! @ d3 is alpha and ignored, d2-0 are rgb. \n"
1164 " vld4.8 {d4,d5,d6,d7}, [%[src]]! @ d7 is alpha and ignored, d6-4 are rgb. \n"
1165 " vshll.u8 q8, d2, #8 @ expand first red for repacking \n"
1166 " vshll.u8 q10, d1, #8 @ expand first green for repacking \n"
1167 " vshll.u8 q11, d0, #8 @ expand first blue for repacking \n"
1168 " vshll.u8 q9, d6, #8 @ expand second red for repacking \n"
1169 " vsri.u16 q8, q10, #5 @ insert first green after red \n"
1170 " vshll.u8 q10, d5, #8 @ expand second green for repacking \n"
1171 " vsri.u16 q8, q11, #11 @ insert first blue after green \n"
1172 " vshll.u8 q11, d4, #8 @ expand second blue for repacking \n"
1173 " vsri.u16 q9, q10, #5 @ insert second green after red \n"
1174 " vsri.u16 q9, q11, #11 @ insert second blue after green \n"
1175 " cmp %[count], #16 \n"
1176 " vst1.16 {d16,d17,d18,d19}, [%[dst]]! @ store 16 pixels \n"
1178 "1: @ end of main loop \n"
1179 " cmp %[count], #8 @ can we still do an 8-pixel block? \n"
1181 " sub %[count], %[count], #8 \n"
1182 " pld [%[src], %[srcStride], lsl #2] @ preload from next scanline \n"
1183 " vld4.8 {d0,d1,d2,d3}, [%[src]]! @ d3 is alpha and ignored, d2-0 are rgb. \n"
1184 " vshll.u8 q8, d2, #8 @ expand first red for repacking \n"
1185 " vshll.u8 q10, d1, #8 @ expand first green for repacking \n"
1186 " vshll.u8 q11, d0, #8 @ expand first blue for repacking \n"
1187 " vsri.u16 q8, q10, #5 @ insert first green after red \n"
1188 " vsri.u16 q8, q11, #11 @ insert first blue after green \n"
1189 " vst1.16 {d16,d17}, [%[dst]]! @ store 8 pixels \n"
1192 // Clobbered input and working registers marked as input/outputs
1193 : [dst] "+r" (dstPtr), [src] "+r" (srcPtr), [count] "+r" (count)
1195 // Unclobbered input
1196 : [srcStride] "r" (srcStride)
1198 // Clobbered vector registers
1199 // NB: these are the quad aliases of the double registers used in the asm
1200 : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "cc", "memory"
1203 // A copy of the above code, in intrinsics-form.
1204 // This should be pretty self-documenting...
1205 while(count >= 16) {
1206 uint8x8x4_t pixelSetA, pixelSetB;
1207 uint16x8_t redA, greenA, blueA;
1208 uint16x8_t redB, greenB, blueB;
1209 uint16x8_t destPixelsA, destPixelsB;
1212 __pld(srcPtr + srcStride);
1213 pixelSetA = vld4_u8((uint8_t*)(srcPtr));
1214 pixelSetB = vld4_u8((uint8_t*)(srcPtr+8));
1217 redA = vshll_n_u8(pixelSetA.val[2], 8);
1218 greenA = vshll_n_u8(pixelSetA.val[1], 8);
1219 blueA = vshll_n_u8(pixelSetA.val[0], 8);
1220 redB = vshll_n_u8(pixelSetB.val[2], 8);
1221 greenB = vshll_n_u8(pixelSetB.val[1], 8);
1222 blueB = vshll_n_u8(pixelSetB.val[0], 8);
1223 destPixelsA = vsriq_n_u16(redA, greenA, 5);
1224 destPixelsB = vsriq_n_u16(redB, greenB, 5);
1225 destPixelsA = vsriq_n_u16(destPixelsA, blueA, 11);
1226 destPixelsB = vsriq_n_u16(destPixelsB, blueB, 11);
1228 // There doesn't seem to be an intrinsic for the double-quadword variant
1229 vst1q_u16(dstPtr , destPixelsA);
1230 vst1q_u16(dstPtr+8, destPixelsB);
1236 uint8x8x4_t pixelSetA;
1237 uint16x8_t redA, greenA, blueA;
1238 uint16x8_t destPixelsA;
1240 __pld(srcPtr + srcStride);
1242 pixelSetA = vld4_u8((uint8_t*)(srcPtr));
1245 redA = vshll_n_u8(pixelSetA.val[2], 8);
1246 greenA = vshll_n_u8(pixelSetA.val[1], 8);
1247 blueA = vshll_n_u8(pixelSetA.val[0], 8);
1248 destPixelsA = vsriq_n_u16(redA, greenA, 5);
1249 destPixelsA = vsriq_n_u16(destPixelsA, blueA, 11);
1251 vst1q_u16(dstPtr , destPixelsA);
1255 #endif // USE_GCC_INLINE_ASM
1260 __pld(srcPtr + srcStride);
1263 uint32_t srcPixelA = *srcPtr++;
1264 uint32_t srcPixelB = *srcPtr++;
1266 // ARM is really good at shift-then-ALU ops.
1267 // This should be a total of six shift-ANDs and five shift-ORs.
1268 uint32_t dstPixelsA;
1269 uint32_t dstPixelsB;
1271 dstPixelsA = ((srcPixelA >> 3) & RBmask);
1272 dstPixelsA |= ((srcPixelA >> 10) & Gmask) << 5;
1273 dstPixelsA |= ((srcPixelA >> 19) & RBmask) << 11;
1275 dstPixelsB = ((srcPixelB >> 3) & RBmask);
1276 dstPixelsB |= ((srcPixelB >> 10) & Gmask) << 5;
1277 dstPixelsB |= ((srcPixelB >> 19) & RBmask) << 11;
1279 // little-endian mode only
1280 *((uint32_t*) dstPtr) = dstPixelsA | (dstPixelsB << 16);
1286 uint32_t srcPixel = *srcPtr++;
1288 // ARM is really good at shift-then-ALU ops.
1289 // This block should end up as three shift-ANDs and two shift-ORs.
1290 uint32_t tmpBlue = (srcPixel >> 3) & RBmask;
1291 uint32_t tmpGreen = (srcPixel >> 10) & Gmask;
1292 uint32_t tmpRed = (srcPixel >> 19) & RBmask;
1293 uint16_t dstPixel = (tmpRed << 11) | (tmpGreen << 5) | tmpBlue;
1295 *dstPtr++ = dstPixel;
1299 srcLine += srcStride;
1300 dstLine += dstStride;
1305 static pixman_bool_t
1306 pixman_fill_neon (uint32_t *bits,
1315 uint32_t byte_stride, color;
1318 /* stride is always a multiple of 32-bit units in pixman */
1319 byte_stride = stride * sizeof(uint32_t);
1324 dst = ((char *) bits) + y * byte_stride + x;
1326 color = _xor << 24 | _xor << 16 | _xor << 8 | _xor;
1329 dst = ((char *) bits) + y * byte_stride + x * 2;
1331 color = _xor << 16 | _xor;
1332 width *= 2; /* width to bytes */
1335 dst = ((char *) bits) + y * byte_stride + x * 4;
1337 width *= 4; /* width to bytes */
1343 #ifdef USE_GCC_INLINE_ASM
1345 /* Special case for widths that are too small to allow wide 128-bit
1346 stores anyway. We don't waste time trying to align the writes,
1347 since there are only a few of them per scanline */
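/* Sketch of the small-width strategy: the width bits select at most one
   8-byte, one 4-byte, one 2-byte and one 1-byte store per scanline (for
   example a 13-byte fill issues 8 + 4 + 1); widths of exactly 8, 4 or 2
   bytes instead use the dedicated single-store loops further down. */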
1349 "cmp %[height], #0\n" /* Check if empty fill */
1351 "vdup.32 d0, %[color]\n" /* Fill the color to neon req */
1353 /* Check whether the width can be handled by a single operation
1354 per scanline. This significantly reduces the number of
1355 test/branch instructions executed for each scanline */
1356 "cmp %[width], #8\n"
1358 "cmp %[width], #4\n"
1360 "cmp %[width], #2\n"
1363 /* Loop starts here for each scanline */
1365 "mov r4, %[dst]\n" /* Starting address of the current line */
1366 "tst %[width], #8\n"
1368 "vst1.8 {d0}, [r4]!\n"
1370 "tst %[width], #4\n"
1372 "str %[color], [r4], #4\n"
1374 "tst %[width], #2\n"
1376 "strh %[color], [r4], #2\n"
1378 "tst %[width], #1\n"
1380 "strb %[color], [r4], #1\n"
1383 "subs %[height], %[height], #1\n"
1384 "add %[dst], %[dst], %[byte_stride]\n"
1388 /* Special fillers for those widths that we can do with a single operation */
1390 "subs %[height], %[height], #1\n"
1391 "vst1.8 {d0}, [%[dst]]\n"
1392 "add %[dst], %[dst], %[byte_stride]\n"
1397 "subs %[height], %[height], #1\n"
1398 "str %[color], [%[dst]]\n"
1399 "add %[dst], %[dst], %[byte_stride]\n"
1404 "subs %[height], %[height], #1\n"
1405 "strh %[color], [%[dst]]\n"
1406 "add %[dst], %[dst], %[byte_stride]\n"
1410 : /* No output members */
1411 : [color] "r" (color), [height] "r" (height), [width] "r" (width),
1412 [dst] "r" (dst) , [byte_stride] "r" (byte_stride)
1413 : "memory", "cc", "d0", "r4", "r5");
1416 "cmp %[height], #0\n" /* Check if empty fill */
1418 "vdup.32 q0, %[color]\n" /* Fill the color to neon req */
1420 /* Loop starts here for each scanline */
1422 "mov r4, %[dst]\n" /* Starting address of the current line */
1423 "mov r5, %[width]\n" /* We're going to write this many bytes */
1424 "ands r6, r4, #15\n" /* Are we at the 128-bit aligned address? */
1425 "beq 2f\n" /* Jump to the best case */
1427 /* We're not 128-bit aligned. However, we know that we can get to the
1428 next aligned location, since the fill is at least 16 bytes wide */
1429 "rsb r6, r6, #16\n" /* We would need to go forward this much */
1430 "sub r5, r5, r6\n" /* Update bytes left */
1433 "vst1.8 {d0[0]}, [r4]!\n"/* Store byte, now we are word aligned */
1437 "vst1.16 {d0[0]}, [r4, :16]!\n"/* Store half word, now we are 16-bit aligned */
1441 "vst1.32 {d0[0]}, [r4, :32]!\n"/* Store word, now we're 32-bit aligned */
1445 "vst1.64 {d0}, [r4, :64]!\n" /* Store qword now we're 64-bit aligned */
1447 /* The good case: We're 128-bit aligned for this scanline */
1449 "and r6, r5, #15\n" /* Number of tailing bytes */
1450 "cmp r5, r6\n" /* Do we have at least one qword to write? */
1451 "beq 6f\n" /* No, we just write the tail */
1452 "lsr r5, r5, #4\n" /* This many full qwords to write */
1454 /* The main block: Do 128-bit aligned writes */
1457 "vst1.64 {d0,d1}, [r4, :128]!\n"
1460 /* Handle the trailing bytes: Do 64, 32, 16 and 8-bit aligned writes as needed.
1461 We know that we're currently at a 128-bit aligned address, so we can just
1462 pick the biggest operations that the remaining write width allows */
1468 "vst1.64 {d0}, [r4, :64]!\n"
1472 "vst1.32 {d0[0]}, [r4, :32]!\n"
1476 "vst1.16 {d0[0]}, [r4, :16]!\n"
1480 "vst1.8 {d0[0]}, [r4]!\n"
1483 /* Handle the next scanline */
1484 "subs %[height], %[height], #1\n"
1485 "add %[dst], %[dst], %[byte_stride]\n"
1488 : /* No output members */
1489 : [color] "r" (color), [height] "r" (height), [width] "r" (width),
1490 [dst] "r" (dst) , [byte_stride] "r" (byte_stride)
1491 : "memory", "cc", "q0", "d0", "d1", "r4", "r5", "r6");
1497 // TODO: intrinsic version for armcc
1504 // TODO: is there a more generic way of doing this being introduced?
1505 #define NEON_SCANLINE_BUFFER_PIXELS (1024)
1507 static inline void QuadwordCopy_neon(
1510 uint32_t count, // of quadwords
1511 uint32_t trailerCount // of bytes
1514 uint8_t *tDst = dst, *tSrc = src;
1516 // Uses aligned multi-register loads to maximise read bandwidth
1517 // on uncached memory such as framebuffers.
1518 // The accesses do not have the aligned qualifiers, so that the copy
1519 // may convert between aligned-uncached and unaligned-cached memory.
1520 // It is assumed that the CPU can infer alignedness from the address.
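// Net effect: equivalent to memcpy (dst, src, count * 16 + trailerCount), with
// the bulk moved sixteen bytes at a time and at most one 8-, 4-, 2- and 1-byte
// transfer for the trailer.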
1522 #ifdef USE_GCC_INLINE_ASM
1525 " cmp %[count], #8 \n"
1526 " blt 1f @ skip oversized fragments \n"
1527 "0: @ start with eight quadwords at a time \n"
1528 " sub %[count], %[count], #8 \n"
1529 " vld1.8 {d16,d17,d18,d19}, [%[src]]! \n"
1530 " vld1.8 {d20,d21,d22,d23}, [%[src]]! \n"
1531 " vld1.8 {d24,d25,d26,d27}, [%[src]]! \n"
1532 " vld1.8 {d28,d29,d30,d31}, [%[src]]! \n"
1533 " cmp %[count], #8 \n"
1534 " vst1.8 {d16,d17,d18,d19}, [%[dst]]! \n"
1535 " vst1.8 {d20,d21,d22,d23}, [%[dst]]! \n"
1536 " vst1.8 {d24,d25,d26,d27}, [%[dst]]! \n"
1537 " vst1.8 {d28,d29,d30,d31}, [%[dst]]! \n"
1539 "1: @ four quadwords \n"
1540 " tst %[count], #4 \n"
1541 " beq 2f @ skip oversized fragment \n"
1542 " vld1.8 {d16,d17,d18,d19}, [%[src]]! \n"
1543 " vld1.8 {d20,d21,d22,d23}, [%[src]]! \n"
1544 " vst1.8 {d16,d17,d18,d19}, [%[dst]]! \n"
1545 " vst1.8 {d20,d21,d22,d23}, [%[dst]]! \n"
1546 "2: @ two quadwords \n"
1547 " tst %[count], #2 \n"
1548 " beq 3f @ skip oversized fragment \n"
1549 " vld1.8 {d16,d17,d18,d19}, [%[src]]! \n"
1550 " vst1.8 {d16,d17,d18,d19}, [%[dst]]! \n"
1551 "3: @ one quadword \n"
1552 " tst %[count], #1 \n"
1553 " beq 4f @ skip oversized fragment \n"
1554 " vld1.8 {d16,d17}, [%[src]]! \n"
1555 " vst1.8 {d16,d17}, [%[dst]]! \n"
1558 // Clobbered input registers marked as input/outputs
1559 : [dst] "+r" (tDst), [src] "+r" (tSrc), [count] "+r" (count)
1561 // No unclobbered inputs
1564 // Clobbered vector registers
1565 // NB: these are the quad aliases of the double registers used in the asm
1566 : "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory"
1572 uint8x16x4_t t1 = vld4q_u8(tSrc);
1573 uint8x16x4_t t2 = vld4q_u8(tSrc + sizeof(uint8x16x4_t));
1574 tSrc += sizeof(uint8x16x4_t) * 2;
1576 vst4q_u8(tDst + sizeof(uint8x16x4_t), t2);
1577 tDst += sizeof(uint8x16x4_t) * 2;
1582 uint8x16x4_t t1 = vld4q_u8(tSrc);
1583 tSrc += sizeof(uint8x16x4_t);
1585 tDst += sizeof(uint8x16x4_t);
1589 uint8x8x4_t t1 = vld4_u8(tSrc);
1590 tSrc += sizeof(uint8x8x4_t);
1592 tDst += sizeof(uint8x8x4_t);
1596 uint8x16_t t1 = vld1q_u8(tSrc);
1597 tSrc += sizeof(uint8x16_t);
1599 tDst += sizeof(uint8x16_t);
1602 #endif // !USE_GCC_INLINE_ASM
1605 if(trailerCount & 8) {
1606 uint8x8_t t1 = vld1_u8(tSrc);
1607 tSrc += sizeof(uint8x8_t);
1609 tDst += sizeof(uint8x8_t);
1612 if(trailerCount & 4) {
1613 *((uint32_t*) tDst) = *((uint32_t*) tSrc);
1618 if(trailerCount & 2) {
1619 *((uint16_t*) tDst) = *((uint16_t*) tSrc);
1624 if(trailerCount & 1) {
1630 static inline void SolidOver565_8pix_neon(
1631 uint32_t glyphColour,
1634 uint32_t destStride, // bytes, not elements
1635 uint32_t maskStride,
1636 uint32_t count // 8-pixel groups
1639 // Inner loop of glyph blitter (solid colour, alpha mask)
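// Per pixel the loop computes, approximately and per channel,
//     dest = (colour * m + dest * (255 - m)) / 256, where m = (mask * colour.alpha) / 256;
// the 5/6/5 destination channels are widened to 8 bits by bit replication
// before blending and repacked afterwards. (Descriptive summary only.)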
1641 #ifdef USE_GCC_INLINE_ASM
1644 " vld4.8 {d20[],d21[],d22[],d23[]}, [%[glyphColour]] @ splat solid colour components \n"
1646 " vld1.16 {d0,d1}, [%[dest]] @ load first pixels from framebuffer \n"
1647 " vld1.8 {d17}, [%[inMask]] @ load alpha mask of glyph \n"
1648 " vmull.u8 q9, d17, d23 @ apply glyph colour alpha to mask \n"
1649 " vshrn.u16 d17, q9, #8 @ reformat it to match original mask \n"
1650 " vmvn d18, d17 @ we need the inverse mask for the background \n"
1651 " vsli.u16 q3, q0, #5 @ duplicate framebuffer blue bits \n"
1652 " vshrn.u16 d2, q0, #8 @ unpack red from framebuffer pixels \n"
1653 " vshrn.u16 d4, q0, #3 @ unpack green \n"
1654 " vsri.u8 d2, d2, #5 @ duplicate red bits (extend 5 to 8) \n"
1655 " vshrn.u16 d6, q3, #2 @ unpack extended blue (truncate 10 to 8) \n"
1656 " vsri.u8 d4, d4, #6 @ duplicate green bits (extend 6 to 8) \n"
1657 " vmull.u8 q1, d2, d18 @ apply inverse mask to background red... \n"
1658 " vmull.u8 q2, d4, d18 @ ...green... \n"
1659 " vmull.u8 q3, d6, d18 @ ...blue \n"
1660 " subs %[count], %[count], #1 @ decrement/test loop counter \n"
1661 " vmlal.u8 q1, d17, d22 @ add masked foreground red... \n"
1662 " vmlal.u8 q2, d17, d21 @ ...green... \n"
1663 " vmlal.u8 q3, d17, d20 @ ...blue \n"
1664 " add %[inMask], %[inMask], %[maskStride] @ advance mask pointer, while we wait \n"
1665 " vsri.16 q1, q2, #5 @ pack green behind red \n"
1666 " vsri.16 q1, q3, #11 @ pack blue into pixels \n"
1667 " vst1.16 {d2,d3}, [%[dest]] @ store composited pixels \n"
1668 " add %[dest], %[dest], %[destStride] @ advance framebuffer pointer \n"
1669 " bne 0b @ next please \n"
1671 // Clobbered registers marked as input/outputs
1672 : [dest] "+r" (dest), [inMask] "+r" (inMask), [count] "+r" (count)
1675 : [destStride] "r" (destStride), [maskStride] "r" (maskStride), [glyphColour] "r" (&glyphColour)
1677 // Clobbers, including the inputs we modify, and potentially lots of memory
1678 : "q0", "q1", "q2", "q3", "d17", "q9", "q10", "q11", "q12", "cc", "memory"
1683 uint8x8x4_t solidColour = vld4_dup_u8((uint8_t*) &glyphColour);
1687 uint16x8_t pixels = vld1q_u16(dest);
1688 uint8x8_t mask = vshrn_n_u16(vmull_u8(solidColour.val[3], vld1_u8(inMask)), 8);
1689 uint8x8_t iMask = vmvn_u8(mask);
1691 uint8x8_t tRed = vshrn_n_u16(pixels, 8);
1692 uint8x8_t tGreen = vshrn_n_u16(pixels, 3);
1693 uint8x8_t tBlue = vshrn_n_u16(vsli_n_u16(pixels, pixels, 5), 2);
1695 uint16x8_t sRed = vmull_u8(vsri_n_u8(tRed , tRed , 5), iMask);
1696 uint16x8_t sGreen = vmull_u8(vsri_n_u8(tGreen, tGreen, 6), iMask);
1697 uint16x8_t sBlue = vmull_u8( tBlue , iMask);
1699 sRed = vmlal_u8(sRed , mask, solidColour.val[2]);
1700 sGreen = vmlal_u8(sGreen, mask, solidColour.val[1]);
1701 sBlue = vmlal_u8(sBlue , mask, solidColour.val[0]);
1703 pixels = vsri_n_u16(sRed, sGreen, 5);
1704 pixels = vsri_n_u16(pixels, sBlue, 11);
1705 vst1q_u16(dest, pixels);
1715 neon_CompositeOver_n_8_0565 (
1716 pixman_implementation_t * impl,
1718 pixman_image_t * src_image,
1719 pixman_image_t * mask_image,
1720 pixman_image_t * dst_image,
1731 uint16_t *dstLine, *alignedLine;
1733 uint32_t dstStride, maskStride;
1734 uint32_t kernelCount, copyCount, copyTail;
1735 uint8_t kernelOffset, copyOffset;
1737 src = _pixman_image_get_solid(src_image, dst_image->bits.format);
1739 // bail out if fully transparent or degenerate
1743 if(width == 0 || height == 0)
1746 if(width > NEON_SCANLINE_BUFFER_PIXELS) {
1747 // split the blit, so we can use a fixed-size scanline buffer
1748 // TODO: there must be a more elegant way of doing this.
1750 for(x=0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS) {
1751 neon_CompositeOver_n_8_0565(impl, op, src_image, mask_image, dst_image, src_x+x, src_y, mask_x+x, mask_y, dest_x+x, dest_y,
1752 (x+NEON_SCANLINE_BUFFER_PIXELS > width) ? width-x : NEON_SCANLINE_BUFFER_PIXELS, height);
1757 fbComposeGetStart (dst_image, dest_x, dest_y, uint16_t, dstStride, dstLine, 1);
1758 fbComposeGetStart (mask_image, mask_x, mask_y, uint8_t, maskStride, maskLine, 1);
1760 // pick the span to process: either whole aligned quadwords covering the scanline,
1761 // or the minimum number of quadwords starting at the left edge, whichever is shorter
1763 unsigned long alignedLeft = (unsigned long)(dstLine) & ~0xF;
1764 unsigned long alignedRight = (((unsigned long)(dstLine + width)) + 0xF) & ~0xF;
1765 unsigned long ceilingLength = (((unsigned long) width) * sizeof(*dstLine) + 0xF) & ~0xF;
1767 // the fast copy should be quadword aligned
1768 copyOffset = dstLine - ((uint16_t*) alignedLeft);
1769 alignedLine = dstLine - copyOffset;
1770 copyCount = (uint32_t) ((alignedRight - alignedLeft) >> 4);
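// Worked example with hypothetical numbers: for 2-byte pixels, dstLine = 0x100E
// and width = 16, alignedLeft = 0x1000, alignedRight = 0x1030, copyOffset = 7
// pixels, copyCount = 3 quadwords and ceilingLength = 0x20. The aligned span
// (0x30 bytes) exceeds ceilingLength, so the kernel runs unaligned over two
// quadwords starting at the true left edge, while the scanline-buffer copies
// still move the full three aligned quadwords.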
1773 if(alignedRight - alignedLeft > ceilingLength) {
1774 // unaligned routine is tightest
1775 kernelCount = (uint32_t) (ceilingLength >> 4);
1776 kernelOffset = copyOffset;
1778 // aligned routine is equally tight, so it is safer to align
1779 kernelCount = copyCount;
1783 // We should avoid reading beyond scanline ends for safety
1784 if(alignedLine < (dstLine - dest_x) ||
1785 (alignedLine + (copyCount * 16 / sizeof(*dstLine))) > ((dstLine - dest_x) + dst_image->bits.width))
1787 // switch to precise read
1788 copyOffset = kernelOffset = 0;
1789 alignedLine = dstLine;
1790 kernelCount = (uint32_t) (ceilingLength >> 4);
1791 copyCount = (width * sizeof(*dstLine)) >> 4;
1792 copyTail = (width * sizeof(*dstLine)) & 0xF;
1797 uint16_t scanLine[NEON_SCANLINE_BUFFER_PIXELS + 8]; // deliberately not initialised
1798 uint8_t glyphLine[NEON_SCANLINE_BUFFER_PIXELS + 8];
1802 // left edge, middle block, right edge
1803 for( ; y--; maskLine += maskStride, alignedLine += dstStride, dstLine += dstStride) {
1804 // We don't want to overrun the edges of the glyph, so realign the edge data into known buffers
1805 QuadwordCopy_neon(glyphLine + copyOffset, maskLine, width >> 4, width & 0xF);
1807 // Uncached framebuffer access is really, really slow if we do it piecemeal.
1808 // It should be much faster if we grab it all at once.
1809 // One scanline should easily fit in L1 cache, so this should not waste RAM bandwidth.
1810 QuadwordCopy_neon(scanLine, alignedLine, copyCount, copyTail);
1812 // Apply the actual filter
1813 SolidOver565_8pix_neon(src, scanLine + kernelOffset, glyphLine + kernelOffset, 8 * sizeof(*dstLine), 8, kernelCount);
1815 // Copy the modified scanline back
1816 QuadwordCopy_neon(dstLine, scanLine + copyOffset, width >> 3, (width & 7) * 2);
1821 #ifdef USE_GCC_INLINE_ASM
1823 static inline void PlainOver565_8pix_neon(
1826 uint32_t destStride, // bytes, not elements
1827 uint32_t count // 8-pixel groups
1830 // Inner loop for plain translucent rects (solid colour without alpha mask)
1832 " vld4.8 {d20[],d21[],d22[],d23[]}, [%[colour]] @ solid colour load/splat \n"
1833 " vmull.u8 q12, d23, d22 @ premultiply alpha red \n"
1834 " vmull.u8 q13, d23, d21 @ premultiply alpha green \n"
1835 " vmull.u8 q14, d23, d20 @ premultiply alpha blue \n"
1836 " vmvn d18, d23 @ inverse alpha for background \n"
1838 " vld1.16 {d0,d1}, [%[dest]] @ load first pixels from framebuffer \n"
1839 " vshrn.u16 d2, q0, #8 @ unpack red from framebuffer pixels \n"
1840 " vshrn.u16 d4, q0, #3 @ unpack green \n"
1841 " vsli.u16 q3, q0, #5 @ duplicate framebuffer blue bits \n"
1842 " vsri.u8 d2, d2, #5 @ duplicate red bits (extend 5 to 8) \n"
1843 " vsri.u8 d4, d4, #6 @ duplicate green bits (extend 6 to 8) \n"
1844 " vshrn.u16 d6, q3, #2 @ unpack extended blue (truncate 10 to 8) \n"
1845 " vmov q0, q12 @ retrieve foreground red \n"
1846 " vmlal.u8 q0, d2, d18 @ blend red - my kingdom for a four-operand MLA \n"
1847 " vmov q1, q13 @ retrieve foreground green \n"
1848 " vmlal.u8 q1, d4, d18 @ blend green \n"
1849 " vmov q2, q14 @ retrieve foreground blue \n"
1850 " vmlal.u8 q2, d6, d18 @ blend blue \n"
1851 " subs %[count], %[count], #1 @ decrement/test loop counter \n"
1852 " vsri.16 q0, q1, #5 @ pack green behind red \n"
1853 " vsri.16 q0, q2, #11 @ pack blue into pixels \n"
1854 " vst1.16 {d0,d1}, [%[dest]] @ store composited pixels \n"
1855 " add %[dest], %[dest], %[destStride] @ advance framebuffer pointer \n"
1856 " bne 0b @ next please \n"
1858 // Clobbered registers marked as input/outputs
1859 : [dest] "+r" (dest), [count] "+r" (count)
1862 : [destStride] "r" (destStride), [colour] "r" (&colour)
1864 // Clobbers, including the inputs we modify, and potentially lots of memory
1865 : "q0", "q1", "q2", "q3", "q9", "q10", "q11", "q12", "q13", "q14", "cc", "memory"
1870 neon_CompositeOver_n_0565 (
1871 pixman_implementation_t * impl,
1873 pixman_image_t * src_image,
1874 pixman_image_t * mask_image,
1875 pixman_image_t * dst_image,
1886 uint16_t *dstLine, *alignedLine;
1888 uint32_t kernelCount, copyCount, copyTail;
1889 uint8_t kernelOffset, copyOffset;
1891 src = _pixman_image_get_solid(src_image, dst_image->bits.format);
1893 // bail out if fully transparent
1897 if(width == 0 || height == 0)
1900 if(width > NEON_SCANLINE_BUFFER_PIXELS) {
1901 // split the blit, so we can use a fixed-size scanline buffer
1902 // TODO: there must be a more elegant way of doing this.
1904 for(x=0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS) {
1905 neon_CompositeOver_n_0565(impl, op, src_image, mask_image, dst_image, src_x+x, src_y, mask_x+x, mask_y, dest_x+x, dest_y,
1906 (x+NEON_SCANLINE_BUFFER_PIXELS > width) ? width-x : NEON_SCANLINE_BUFFER_PIXELS, height);
1911 fbComposeGetStart (dst_image, dest_x, dest_y, uint16_t, dstStride, dstLine, 1);
1913 // pick the span to process: either whole aligned quadwords covering the scanline,
1914 // or the minimum number of quadwords starting at the left edge, whichever is shorter
1916 unsigned long alignedLeft = (unsigned long)(dstLine) & ~0xF;
1917 unsigned long alignedRight = (((unsigned long)(dstLine + width)) + 0xF) & ~0xF;
1918 unsigned long ceilingLength = (((unsigned long) width) * sizeof(*dstLine) + 0xF) & ~0xF;
1920 // the fast copy should be quadword aligned
1921 copyOffset = dstLine - ((uint16_t*) alignedLeft);
1922 alignedLine = dstLine - copyOffset;
1923 copyCount = (uint32_t) ((alignedRight - alignedLeft) >> 4);
1926 if(alignedRight - alignedLeft > ceilingLength) {
1927 // unaligned routine is tightest
1928 kernelCount = (uint32_t) (ceilingLength >> 4);
1929 kernelOffset = copyOffset;
1931 // aligned routine is equally tight, so it is safer to align
1932 kernelCount = copyCount;
1936 // We should avoid reading beyond scanline ends for safety
1937 if(alignedLine < (dstLine - dest_x) ||
1938 (alignedLine + (copyCount * 16 / sizeof(*dstLine))) > ((dstLine - dest_x) + dst_image->bits.width))
1940 // switch to precise read
1941 copyOffset = kernelOffset = 0;
1942 alignedLine = dstLine;
1943 kernelCount = (uint32_t) (ceilingLength >> 4);
1944 copyCount = (width * sizeof(*dstLine)) >> 4;
1945 copyTail = (width * sizeof(*dstLine)) & 0xF;
1950 uint16_t scanLine[NEON_SCANLINE_BUFFER_PIXELS + 8]; // deliberately not initialised
1953 // left edge, middle block, right edge
1954 for( ; height--; alignedLine += dstStride, dstLine += dstStride) {
1956 // Uncached framebuffer access is really, really slow if we do it piecemeal.
1957 // It should be much faster if we grab it all at once.
1958 // One scanline should easily fit in L1 cache, so this should not waste RAM bandwidth.
1959 QuadwordCopy_neon(scanLine, alignedLine, copyCount, copyTail);
1961 // Apply the actual filter
1962 PlainOver565_8pix_neon(src, scanLine + kernelOffset, 8 * sizeof(*dstLine), kernelCount);
1964 // Copy the modified scanline back
1965 QuadwordCopy_neon(dstLine, scanLine + copyOffset, width >> 3, (width & 7) * 2);
1970 static inline void ARGB8_Over565_8pix_neon(
1973 uint32_t srcStride, // bytes, not elements
1974 uint32_t count // 8-pixel groups
1979 " pld [%[src], %[srcStride]] @ preload from next scanline \n"
1980 " vld1.16 {d0,d1}, [%[dest]] @ load pixels from framebuffer \n"
1981 " vld4.8 {d20,d21,d22,d23},[%[src]]! @ load source image pixels \n"
1982 " vsli.u16 q3, q0, #5 @ duplicate framebuffer blue bits \n"
1983 " vshrn.u16 d2, q0, #8 @ unpack red from framebuffer pixels \n"
1984 " vshrn.u16 d4, q0, #3 @ unpack green \n"
1985 " vmvn d18, d23 @ we need the inverse alpha for the background \n"
1986 " vsri.u8 d2, d2, #5 @ duplicate red bits (extend 5 to 8) \n"
1987 " vshrn.u16 d6, q3, #2 @ unpack extended blue (truncate 10 to 8) \n"
1988 " vsri.u8 d4, d4, #6 @ duplicate green bits (extend 6 to 8) \n"
1989 " vmull.u8 q1, d2, d18 @ apply inverse alpha to background red... \n"
1990 " vmull.u8 q2, d4, d18 @ ...green... \n"
1991 " vmull.u8 q3, d6, d18 @ ...blue \n"
1992 " subs %[count], %[count], #1 @ decrement/test loop counter \n"
1993 " vmlal.u8 q1, d23, d22 @ add blended foreground red... \n"
1994 " vmlal.u8 q2, d23, d21 @ ...green... \n"
1995 " vmlal.u8 q3, d23, d20 @ ...blue \n"
1996 " vsri.16 q1, q2, #5 @ pack green behind red \n"
1997 " vsri.16 q1, q3, #11 @ pack blue into pixels \n"
1998 " vst1.16 {d2,d3}, [%[dest]]! @ store composited pixels \n"
1999 " bne 0b @ next please \n"
2001 // Clobbered registers marked as input/outputs
2002 : [dest] "+r" (dest), [src] "+r" (src), [count] "+r" (count)
2005 : [srcStride] "r" (srcStride)
2007 // Clobbers, including the inputs we modify, and potentially lots of memory
2008 : "q0", "q1", "q2", "q3", "d17", "d18", "q10", "q11", "cc", "memory"
2013 neon_CompositeOver_8888_0565 (
2014 pixman_implementation_t * impl,
2016 pixman_image_t * src_image,
2017 pixman_image_t * mask_image,
2018 pixman_image_t * dst_image,
2029 uint16_t *dstLine, *alignedLine;
2030 uint32_t dstStride, srcStride;
2031 uint32_t kernelCount, copyCount, copyTail;
2032 uint8_t kernelOffset, copyOffset;
2034 // we assume mask is opaque
2035 // so the only alpha to deal with is embedded in src
2037 if(width > NEON_SCANLINE_BUFFER_PIXELS) {
2038 // split the blit, so we can use a fixed-size scanline buffer
2040 for(x=0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS) {
2041 neon_CompositeOver_8888_0565(impl, op, src_image, mask_image, dst_image, src_x+x, src_y, mask_x+x, mask_y, dest_x+x, dest_y,
2042 (x+NEON_SCANLINE_BUFFER_PIXELS > width) ? width-x : NEON_SCANLINE_BUFFER_PIXELS, height);
2047 fbComposeGetStart (dst_image, dest_x, dest_y, uint16_t, dstStride, dstLine, 1);
2048 fbComposeGetStart (src_image, src_x, src_y, uint32_t, srcStride, srcLine, 1);
2050 // pick the span to process: either whole aligned quadwords covering the scanline,
2051 // or the minimum number of quadwords starting at the left edge, whichever is shorter
2053 unsigned long alignedLeft = (unsigned long)(dstLine) & ~0xF;
2054 unsigned long alignedRight = (((unsigned long)(dstLine + width)) + 0xF) & ~0xF;
2055 unsigned long ceilingLength = (((unsigned long) width) * sizeof(*dstLine) + 0xF) & ~0xF;
2057 // the fast copy should be quadword aligned
2058 copyOffset = dstLine - ((uint16_t*) alignedLeft);
2059 alignedLine = dstLine - copyOffset;
2060 copyCount = (uint32_t) ((alignedRight - alignedLeft) >> 4);
2063 if(alignedRight - alignedLeft > ceilingLength) {
2064 // unaligned routine is tightest
2065 kernelCount = (uint32_t) (ceilingLength >> 4);
2066 kernelOffset = copyOffset;
2068 // aligned routine is equally tight, so it is safer to align
2069 kernelCount = copyCount;
2073 // We should avoid reading beyond scanline ends for safety
2074 if(alignedLine < (dstLine - dest_x) ||
2075 (alignedLine + (copyCount * 16 / sizeof(*dstLine))) > ((dstLine - dest_x) + dst_image->bits.width))
2077 // switch to precise read
2078 copyOffset = kernelOffset = 0;
2079 alignedLine = dstLine;
2080 kernelCount = (uint32_t) (ceilingLength >> 4);
2081 copyCount = (width * sizeof(*dstLine)) >> 4;
2082 copyTail = (width * sizeof(*dstLine)) & 0xF;
2086 /* Preload the first input scanline */
2088 uint8_t *srcPtr = (uint8_t*) srcLine;
2089 uint32_t count = (width + 15) / 16;
2091 #ifdef USE_GCC_INLINE_ASM
2094 " subs %[count], %[count], #1 \n"
2096 " add %[src], %[src], #64 \n"
2099 // Clobbered input registers marked as input/outputs
2100 : [src] "+r" (srcPtr), [count] "+r" (count)
2101 : // no unclobbered inputs
2113 uint16_t scanLine[NEON_SCANLINE_BUFFER_PIXELS + 8]; // deliberately not initialised
2116 // left edge, middle block, right edge
2117 for( ; height--; srcLine += srcStride, alignedLine += dstStride, dstLine += dstStride) {
2118 // Uncached framebuffer access is really, really slow if we do it piecemeal.
2119 // It should be much faster if we grab it all at once.
2120 // One scanline should easily fit in L1 cache, so this should not waste RAM bandwidth.
2121 QuadwordCopy_neon(scanLine, alignedLine, copyCount, copyTail);
2123 // Apply the actual filter
2124 ARGB8_Over565_8pix_neon(srcLine, scanLine + kernelOffset, srcStride * sizeof(*srcLine), kernelCount);
2126 // Copy the modified scanline back
2127 QuadwordCopy_neon(dstLine, scanLine + copyOffset, width >> 3, (width & 7) * 2);
2132 #endif // USE_GCC_INLINE_ASM
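/* Each entry gives the operator, the source, mask and destination formats a
   routine above can handle, the routine itself, and a flags word (such as
   NEED_SOLID_MASK). _pixman_run_fast_path() scans this table at composite
   time and the caller falls back to the delegate when nothing matches. */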
2134 static const pixman_fast_path_t arm_neon_fast_path_array[] =
2136 { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, neon_CompositeAdd_8888_8_8, 0 },
2137 { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, neon_CompositeAdd_8000_8000, 0 },
2138 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, neon_CompositeOver_n_8_0565, 0 },
2139 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, neon_CompositeOver_n_8_0565, 0 },
2140 { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_CompositeSrc_24_16, 0 },
2141 { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_CompositeSrc_24_16, 0 },
2142 { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_CompositeSrc_24_16, 0 },
2143 { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_CompositeSrc_24_16, 0 },
2144 #ifdef USE_GCC_INLINE_ASM
2145 { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, neon_CompositeSrc_16_16, 0 },
2146 { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, neon_CompositeSrc_16_16, 0 },
2147 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, neon_CompositeOver_n_0565, 0 },
2148 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_b5g6r5, neon_CompositeOver_n_0565, 0 },
2149 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_CompositeOver_8888_0565, 0 },
2150 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_CompositeOver_8888_0565, 0 },
2152 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, neon_composite_over_8888_8888, 0 },
2153 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, neon_composite_over_8888_8888, 0 },
2154 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, neon_composite_over_8888_8888, 0 },
2155 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, neon_composite_over_8888_8888, 0 },
2156 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK },
2157 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK },
2158 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, neon_CompositeOver_n_8_8888, 0 },
2159 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, neon_CompositeOver_n_8_8888, 0 },
2160 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, neon_CompositeOver_n_8_8888, 0 },
2161 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, neon_CompositeOver_n_8_8888, 0 },
2165 const pixman_fast_path_t *const arm_neon_fast_paths = arm_neon_fast_path_array;
2168 arm_neon_composite (pixman_implementation_t *imp,
2170 pixman_image_t *src,
2171 pixman_image_t *mask,
2172 pixman_image_t *dest,
2182 if (_pixman_run_fast_path (arm_neon_fast_paths, imp,
2183 op, src, mask, dest,
2192 _pixman_implementation_composite (imp->delegate, op,
2200 static pixman_bool_t
2208 int src_x, int src_y,
2209 int dst_x, int dst_y,
2210 int width, int height)
2212 if(!width || !height)
2215 // accelerate only straight copies involving complete bytes
2216 if(src_bpp != dst_bpp || (src_bpp & 7))
2220 uint32_t bytes_per_pixel = src_bpp >> 3;
2221 uint32_t byte_width = width * bytes_per_pixel;
2222 int32_t src_stride_bytes = src_stride * 4; // stride parameters are in 32-bit words
2223 int32_t dst_stride_bytes = dst_stride * 4;
2224 uint8_t *src_bytes = ((uint8_t*) src_bits) + src_y * src_stride_bytes + src_x * bytes_per_pixel;
2225 uint8_t *dst_bytes = ((uint8_t*) dst_bits) + dst_y * dst_stride_bytes + dst_x * bytes_per_pixel;
2226 uint32_t quadword_count = byte_width / 16;
2227 uint32_t offset = byte_width % 16;
2230 QuadwordCopy_neon(dst_bytes, src_bytes, quadword_count, offset);
2231 src_bytes += src_stride_bytes;
2232 dst_bytes += dst_stride_bytes;
2239 static pixman_bool_t
2240 arm_neon_blt (pixman_implementation_t *imp,
2247 int src_x, int src_y,
2248 int dst_x, int dst_y,
2249 int width, int height)
2251 if (pixman_blt_neon (
2252 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
2253 src_x, src_y, dst_x, dst_y, width, height))
2256 return _pixman_implementation_blt (
2258 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
2259 src_x, src_y, dst_x, dst_y, width, height);
2262 static pixman_bool_t
2263 arm_neon_fill (pixman_implementation_t *imp,
2273 if (pixman_fill_neon (bits, stride, bpp, x, y, width, height, xor))
2276 return _pixman_implementation_fill (
2277 imp->delegate, bits, stride, bpp, x, y, width, height, xor);
2280 pixman_implementation_t *
2281 _pixman_implementation_create_arm_neon (void)
2283 pixman_implementation_t *simd = _pixman_implementation_create_arm_simd();
2284 pixman_implementation_t *imp = _pixman_implementation_create (simd);
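/* The NEON implementation sits on top of the ARM SIMD implementation as its
   delegate, so any composite, blt or fill that the routines in this file do
   not accelerate is passed down the delegate chain instead. */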
2286 imp->composite = arm_neon_composite;
2287 imp->blt = arm_neon_blt;
2288 imp->fill = arm_neon_fill;