2 * Copyright © 2009 ARM Ltd, Movial Creative Technologies Oy
4 * Permission to use, copy, modify, distribute, and sell this software and its
5 * documentation for any purpose is hereby granted without fee, provided that
6 * the above copyright notice appear in all copies and that both that
7 * copyright notice and this permission notice appear in supporting
8 * documentation, and that the name of ARM Ltd not be used in
9 * advertising or publicity pertaining to distribution of the software without
10 * specific, written prior permission. ARM Ltd makes no
11 * representations about the suitability of this software for any purpose. It
12 * is provided "as is" without express or implied warranty.
14 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
15 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
16 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
17 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
18 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
19 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
20 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
23 * Author: Ian Rickards (ian.rickards@arm.com)
24 * Author: Jonathan Morton (jonathan.morton@movial.com)
25 * Author: Markku Vire (markku.vire@movial.com)
35 #include "pixman-private.h"
37 // Deal with an intrinsic that is defined differently in GCC
38 #if !defined(__ARMCC_VERSION) && !defined(__pld)
39 #define __pld(_x) __builtin_prefetch(_x)
42 static force_inline uint8x8x4_t unpack0565(uint16x8_t rgb)
47 res.val[3] = vdup_n_u8(0);
48 gb = vshrq_n_u16(rgb, 5);
49 b = vshrq_n_u16(rgb, 5+6);
50 res.val[0] = vmovn_u16(rgb); // get low 5 bits
51 res.val[1] = vmovn_u16(gb); // get mid 6 bits
52 res.val[2] = vmovn_u16(b); // get top 5 bits
54 res.val[0] = vshl_n_u8(res.val[0], 3); // shift to top
55 res.val[1] = vshl_n_u8(res.val[1], 2); // shift to top
56 res.val[2] = vshl_n_u8(res.val[2], 3); // shift to top
58 res.val[0] = vsri_n_u8(res.val[0], res.val[0], 5);
59 res.val[1] = vsri_n_u8(res.val[1], res.val[1], 6);
60 res.val[2] = vsri_n_u8(res.val[2], res.val[2], 5);
65 static force_inline uint16x8_t pack0565(uint8x8x4_t s)
67 uint16x8_t rgb, val_g, val_r;
69 rgb = vshll_n_u8(s.val[2],8);
70 val_g = vshll_n_u8(s.val[1],8);
71 val_r = vshll_n_u8(s.val[0],8);
72 rgb = vsriq_n_u16(rgb, val_g, 5);
73 rgb = vsriq_n_u16(rgb, val_r, 5+6);
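/*
 * A scalar sketch of the per-lane channel expansion done by unpack0565
 * above (illustrative only; the helper name is ours and nothing calls it):
 * a 5- or 6-bit channel is shifted to the top of the byte and its leading
 * bits are replicated into the vacated low bits, which is what each
 * vshl/vsri pair does.  pack0565 is the reverse, using vsri on 16-bit
 * lanes to insert green below red and blue below green.
 */
static force_inline uint8_t expand_565_channel(uint8_t bits, int width)
{
    uint8_t top = (uint8_t)(bits << (8 - width)); /* shift channel to the top   */
    return top | (top >> width);                  /* replicate the leading bits */
}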
78 static force_inline uint8x8_t neon2mul(uint8x8_t x, uint8x8_t alpha)
83 tmp = vmull_u8(x,alpha);
84 tmp2 = vrshrq_n_u16(tmp,8);
85 res = vraddhn_u16(tmp,tmp2);
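/*
 * Scalar sketch of what neon2mul (and neon8mul below) compute per lane
 * (illustrative only; not called anywhere): x * alpha is divided by 255
 * with rounding via (t + ((t + 128) >> 8) + 128) >> 8, which is exactly
 * what the vrshr.u16 #8 / vraddhn.u16 pair evaluates.
 */
static force_inline uint8_t scalar_mul_div_255(uint8_t x, uint8_t alpha)
{
    uint16_t t = (uint16_t)x * alpha;                     /* 16-bit product */
    return (uint8_t)((t + ((t + 128) >> 8) + 128) >> 8);  /* rounded /255   */
}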
90 static force_inline uint8x8x4_t neon8mul(uint8x8x4_t x, uint8x8_t alpha)
94 uint16x8_t qtmp1,qtmp2;
96 tmp.val[0] = vmull_u8(x.val[0],alpha);
97 tmp.val[1] = vmull_u8(x.val[1],alpha);
98 tmp.val[2] = vmull_u8(x.val[2],alpha);
99 tmp.val[3] = vmull_u8(x.val[3],alpha);
101 qtmp1 = vrshrq_n_u16(tmp.val[0],8);
102 qtmp2 = vrshrq_n_u16(tmp.val[1],8);
103 res.val[0] = vraddhn_u16(tmp.val[0],qtmp1);
104 qtmp1 = vrshrq_n_u16(tmp.val[2],8);
105 res.val[1] = vraddhn_u16(tmp.val[1],qtmp2);
106 qtmp2 = vrshrq_n_u16(tmp.val[3],8);
107 res.val[2] = vraddhn_u16(tmp.val[2],qtmp1);
108 res.val[3] = vraddhn_u16(tmp.val[3],qtmp2);
113 static force_inline uint8x8x4_t neon8qadd(uint8x8x4_t x, uint8x8x4_t y)
117 res.val[0] = vqadd_u8(x.val[0],y.val[0]);
118 res.val[1] = vqadd_u8(x.val[1],y.val[1]);
119 res.val[2] = vqadd_u8(x.val[2],y.val[2]);
120 res.val[3] = vqadd_u8(x.val[3],y.val[3]);
127 neon_composite_add_8000_8000 (
128 pixman_implementation_t * impl,
130 pixman_image_t * src_image,
131 pixman_image_t * mask_image,
132 pixman_image_t * dst_image,
142 uint8_t *dst_line, *dst;
143 uint8_t *src_line, *src;
144 int dst_stride, src_stride;
147 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
148 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
152 // Use overlapping 8-pixel method
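// The idea, as the code below implements it: pixels are handled eight at a
// time; when the width is not a multiple of eight, the pointers are advanced
// by width & 7 after the first block so the second block overlaps the first,
// and each result is held back in keep_dst for one iteration so that the
// overlapping re-read never sees freshly written destination data.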
156 dst_line += dst_stride;
158 src_line += src_stride;
163 #ifndef USE_GCC_INLINE_ASM
164 uint8x8_t sval,dval,temp;
166 sval = vld1_u8((void*)src);
167 dval = vld1_u8((void*)dst);
170 temp = vqadd_u8(dval,sval);
178 sval = vld1_u8((void*)src);
179 dval = vld1_u8((void*)dst);
181 vst1_u8((void*)keep_dst,temp);
184 temp = vqadd_u8(dval,sval);
190 vst1_u8((void*)keep_dst,temp);
193 // avoid using d8-d15 (q4-q7), the AAPCS callee-saved registers
194 "vld1.8 {d0}, [%[src]]\n\t"
195 "vld1.8 {d4}, [%[dst]]\n\t"
196 "mov %[keep_dst], %[dst]\n\t"
198 "and ip, %[w], #7\n\t"
199 "add %[src], %[src], ip\n\t"
200 "add %[dst], %[dst], ip\n\t"
201 "subs %[w], %[w], ip\n\t"
205 "vld1.8 {d0}, [%[src]]!\n\t"
206 "vld1.8 {d4}, [%[dst]]!\n\t"
207 "vst1.8 {d20}, [%[keep_dst]]\n\t"
208 "sub %[keep_dst], %[dst], #8\n\t"
209 "subs %[w], %[w], #8\n\t"
211 "vqadd.u8 d20, d0, d4\n\t"
216 "vst1.8 {d20}, [%[keep_dst]]\n\t"
218 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
220 : "ip", "cc", "memory", "d0","d4",
228 const uint8_t nil = 0;
229 const uint8x8_t vnil = vld1_dup_u8(&nil);
234 dst_line += dst_stride;
236 src_line += src_stride;
238 uint8x8_t sval=vnil, dval=vnil;
239 uint8_t *dst4=0, *dst2=0;
243 sval = vreinterpret_u8_u32(vld1_lane_u32((void*)src,vreinterpret_u32_u8(sval),1));
244 dval = vreinterpret_u8_u32(vld1_lane_u32((void*)dst,vreinterpret_u32_u8(dval),1));
251 sval = vreinterpret_u8_u16(vld1_lane_u16((void*)src,vreinterpret_u16_u8(sval),1));
252 dval = vreinterpret_u8_u16(vld1_lane_u16((void*)dst,vreinterpret_u16_u8(dval),1));
259 sval = vld1_lane_u8(src,sval,1);
260 dval = vld1_lane_u8(dst,dval,1);
263 dval = vqadd_u8(dval,sval);
266 vst1_lane_u8(dst,dval,1);
268 vst1_lane_u16((void*)dst2,vreinterpret_u16_u8(dval),1);
270 vst1_lane_u32((void*)dst4,vreinterpret_u32_u8(dval),1);
277 neon_composite_over_8888_8888 (
278 pixman_implementation_t * impl,
280 pixman_image_t * src_image,
281 pixman_image_t * mask_image,
282 pixman_image_t * dst_image,
292 uint32_t *dst_line, *dst;
293 uint32_t *src_line, *src;
294 int dst_stride, src_stride;
297 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
298 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
302 // Use overlapping 8-pixel method
306 dst_line += dst_stride;
308 src_line += src_stride;
311 uint32_t *keep_dst=0;
313 #ifndef USE_GCC_INLINE_ASM
314 uint8x8x4_t sval,dval,temp;
316 sval = vld4_u8((void*)src);
317 dval = vld4_u8((void*)dst);
320 temp = neon8mul(dval,vmvn_u8(sval.val[3]));
321 temp = neon8qadd(sval,temp);
329 sval = vld4_u8((void*)src);
330 dval = vld4_u8((void*)dst);
332 vst4_u8((void*)keep_dst,temp);
335 temp = neon8mul(dval,vmvn_u8(sval.val[3]));
336 temp = neon8qadd(sval,temp);
342 vst4_u8((void*)keep_dst,temp);
345 // avoid using d8-d15 (q4-q7), the AAPCS callee-saved registers
346 "vld4.8 {d0-d3}, [%[src]]\n\t"
347 "vld4.8 {d4-d7}, [%[dst]]\n\t"
348 "mov %[keep_dst], %[dst]\n\t"
350 "and ip, %[w], #7\n\t"
351 "add %[src], %[src], ip, LSL#2\n\t"
352 "add %[dst], %[dst], ip, LSL#2\n\t"
353 "subs %[w], %[w], ip\n\t"
357 "vld4.8 {d0-d3}, [%[src]]!\n\t"
358 "vld4.8 {d4-d7}, [%[dst]]!\n\t"
359 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
360 "sub %[keep_dst], %[dst], #8*4\n\t"
361 "subs %[w], %[w], #8\n\t"
364 "vmull.u8 q10, d31, d4\n\t"
365 "vmull.u8 q11, d31, d5\n\t"
366 "vmull.u8 q12, d31, d6\n\t"
367 "vmull.u8 q13, d31, d7\n\t"
368 "vrshr.u16 q8, q10, #8\n\t"
369 "vrshr.u16 q9, q11, #8\n\t"
370 "vraddhn.u16 d20, q10, q8\n\t"
371 "vraddhn.u16 d21, q11, q9\n\t"
372 "vrshr.u16 q8, q12, #8\n\t"
373 "vrshr.u16 q9, q13, #8\n\t"
374 "vraddhn.u16 d22, q12, q8\n\t"
375 "vraddhn.u16 d23, q13, q9\n\t"
377 "vqadd.u8 d20, d0, d20\n\t"
378 "vqadd.u8 d21, d1, d21\n\t"
379 "vqadd.u8 d22, d2, d22\n\t"
380 "vqadd.u8 d23, d3, d23\n\t"
385 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
387 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
389 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
390 "d16","d17","d18","d19","d20","d21","d22","d23"
397 uint8x8_t alpha_selector=vreinterpret_u8_u64(vcreate_u64(0x0707070703030303ULL));
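// vtbl index pattern: lanes 0-3 select byte 3 (alpha of the first packed
// pixel) and lanes 4-7 select byte 7 (alpha of the second), so
// vtbl1_u8(..., alpha_selector) broadcasts each pixel's alpha across all
// four of its channel bytes.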
403 dst_line += dst_stride;
405 src_line += src_stride;
412 /* two 32-bit pixels packed into D-reg; ad-hoc vectorization */
413 sval = vreinterpret_u8_u32(vld1_u32((void*)src));
414 dval = vreinterpret_u8_u32(vld1_u32((void*)dst));
415 dval = neon2mul(dval,vtbl1_u8(vmvn_u8(sval),alpha_selector));
416 vst1_u8((void*)dst,vqadd_u8(sval,dval));
427 /* single 32-bit pixel in lane 0 */
428 sval = vreinterpret_u8_u32(vld1_dup_u32((void*)src)); // only interested in lane 0
429 dval = vreinterpret_u8_u32(vld1_dup_u32((void*)dst)); // only interested in lane 0
430 dval = neon2mul(dval,vtbl1_u8(vmvn_u8(sval),alpha_selector));
431 vst1_lane_u32((void*)dst,vreinterpret_u32_u8(vqadd_u8(sval,dval)),0);
438 neon_composite_over_8888_n_8888 (
439 pixman_implementation_t * impl,
441 pixman_image_t * src_image,
442 pixman_image_t * mask_image,
443 pixman_image_t * dst_image,
453 uint32_t *dst_line, *dst;
454 uint32_t *src_line, *src;
456 int dst_stride, src_stride;
458 uint8x8_t mask_alpha;
460 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
461 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
463 mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
464 mask_alpha = vdup_n_u8((mask) >> 24);
468 // Use overlapping 8-pixel method
472 dst_line += dst_stride;
474 src_line += src_stride;
477 uint32_t *keep_dst=0;
479 #ifndef USE_GCC_INLINE_ASM
480 uint8x8x4_t sval,dval,temp;
482 sval = vld4_u8((void*)src);
483 dval = vld4_u8((void*)dst);
486 sval = neon8mul(sval,mask_alpha);
487 temp = neon8mul(dval,vmvn_u8(sval.val[3]));
488 temp = neon8qadd(sval,temp);
496 sval = vld4_u8((void*)src);
497 dval = vld4_u8((void*)dst);
499 vst4_u8((void*)keep_dst,temp);
502 sval = neon8mul(sval,mask_alpha);
503 temp = neon8mul(dval,vmvn_u8(sval.val[3]));
504 temp = neon8qadd(sval,temp);
510 vst4_u8((void*)keep_dst,temp);
513 // avoid using d8-d15 (q4-q7), the AAPCS callee-saved registers
514 "vdup.32 d30, %[mask]\n\t"
515 "vdup.8 d30, d30[3]\n\t"
517 "vld4.8 {d0-d3}, [%[src]]\n\t"
518 "vld4.8 {d4-d7}, [%[dst]]\n\t"
519 "mov %[keep_dst], %[dst]\n\t"
521 "and ip, %[w], #7\n\t"
522 "add %[src], %[src], ip, LSL#2\n\t"
523 "add %[dst], %[dst], ip, LSL#2\n\t"
524 "subs %[w], %[w], ip\n\t"
528 "vld4.8 {d0-d3}, [%[src]]!\n\t"
529 "vld4.8 {d4-d7}, [%[dst]]!\n\t"
530 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
531 "sub %[keep_dst], %[dst], #8*4\n\t"
532 "subs %[w], %[w], #8\n\t"
535 "vmull.u8 q10, d30, d0\n\t"
536 "vmull.u8 q11, d30, d1\n\t"
537 "vmull.u8 q12, d30, d2\n\t"
538 "vmull.u8 q13, d30, d3\n\t"
539 "vrshr.u16 q8, q10, #8\n\t"
540 "vrshr.u16 q9, q11, #8\n\t"
541 "vraddhn.u16 d0, q10, q8\n\t"
542 "vraddhn.u16 d1, q11, q9\n\t"
543 "vrshr.u16 q9, q13, #8\n\t"
544 "vrshr.u16 q8, q12, #8\n\t"
545 "vraddhn.u16 d3, q13, q9\n\t"
546 "vraddhn.u16 d2, q12, q8\n\t"
549 "vmull.u8 q10, d31, d4\n\t"
550 "vmull.u8 q11, d31, d5\n\t"
551 "vmull.u8 q12, d31, d6\n\t"
552 "vmull.u8 q13, d31, d7\n\t"
553 "vrshr.u16 q8, q10, #8\n\t"
554 "vrshr.u16 q9, q11, #8\n\t"
555 "vraddhn.u16 d20, q10, q8\n\t"
556 "vrshr.u16 q8, q12, #8\n\t"
557 "vraddhn.u16 d21, q11, q9\n\t"
558 "vrshr.u16 q9, q13, #8\n\t"
559 "vraddhn.u16 d22, q12, q8\n\t"
560 "vraddhn.u16 d23, q13, q9\n\t"
562 "vqadd.u8 d20, d0, d20\n\t"
563 "vqadd.u8 d21, d1, d21\n\t"
564 "vqadd.u8 d22, d2, d22\n\t"
565 "vqadd.u8 d23, d3, d23\n\t"
570 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
572 : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
574 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
575 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27",
583 uint8x8_t alpha_selector=vreinterpret_u8_u64(vcreate_u64(0x0707070703030303ULL));
589 dst_line += dst_stride;
591 src_line += src_stride;
598 sval = vreinterpret_u8_u32(vld1_u32((void*)src));
599 dval = vreinterpret_u8_u32(vld1_u32((void*)dst));
601 /* sval * const alpha_mul */
602 sval = neon2mul(sval,mask_alpha);
604 /* dval * 255-(src alpha) */
605 dval = neon2mul(dval,vtbl1_u8(vmvn_u8(sval), alpha_selector));
607 vst1_u8((void*)dst,vqadd_u8(sval,dval));
618 sval = vreinterpret_u8_u32(vld1_dup_u32((void*)src));
619 dval = vreinterpret_u8_u32(vld1_dup_u32((void*)dst));
621 /* sval * const alpha_mul */
622 sval = neon2mul(sval,mask_alpha);
624 /* dval * 255-(src alpha) */
625 dval = neon2mul(dval,vtbl1_u8(vmvn_u8(sval), alpha_selector));
627 vst1_lane_u32((void*)dst,vreinterpret_u32_u8(vqadd_u8(sval,dval)),0);
635 neon_composite_over_n_8_8888 (
636 pixman_implementation_t * impl,
638 pixman_image_t * src_image,
639 pixman_image_t * mask_image,
640 pixman_image_t * dst_image,
651 uint32_t *dst_line, *dst;
652 uint8_t *mask_line, *mask;
653 int dst_stride, mask_stride;
657 uint8x8_t mask_selector=vreinterpret_u8_u64(vcreate_u64(0x0101010100000000ULL));
658 uint8x8_t alpha_selector=vreinterpret_u8_u64(vcreate_u64(0x0707070703030303ULL));
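// mask_selector is the analogous vtbl pattern for the a8 mask: lanes 0-3
// pick mask byte 0 and lanes 4-7 pick mask byte 1, replicating each mask
// value across the four channel bytes of its pixel; alpha_selector is as
// described for the 8888-over-8888 path above.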
660 src = _pixman_image_get_solid(src_image, dst_image->bits.format);
662 // bail out if fully transparent
667 sval2=vreinterpret_u8_u32(vdup_n_u32(src));
668 sval8.val[0]=vdup_lane_u8(sval2,0);
669 sval8.val[1]=vdup_lane_u8(sval2,1);
670 sval8.val[2]=vdup_lane_u8(sval2,2);
671 sval8.val[3]=vdup_lane_u8(sval2,3);
673 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
674 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
678 // Use overlapping 8-pixel method, modified to avoid re-reading destination pixels that have already been rewritten
681 uint32_t *keep_dst=0;
684 dst_line += dst_stride;
686 mask_line += mask_stride;
689 #ifndef USE_GCC_INLINE_ASM
691 uint8x8x4_t dval, temp;
693 alpha = vld1_u8((void*)mask);
694 dval = vld4_u8((void*)dst);
697 temp = neon8mul(sval8,alpha);
698 dval = neon8mul(dval,vmvn_u8(temp.val[3]));
699 temp = neon8qadd(temp,dval);
707 alpha = vld1_u8((void*)mask);
708 dval = vld4_u8((void*)dst);
710 vst4_u8((void*)keep_dst,temp);
713 temp = neon8mul(sval8,alpha);
714 dval = neon8mul(dval,vmvn_u8(temp.val[3]));
715 temp = neon8qadd(temp,dval);
721 vst4_u8((void*)keep_dst,temp);
724 "vdup.32 d0, %[src]\n\t"
725 "vdup.8 d1, d0[1]\n\t"
726 "vdup.8 d2, d0[2]\n\t"
727 "vdup.8 d3, d0[3]\n\t"
728 "vdup.8 d0, d0[0]\n\t"
730 "vld4.8 {d4-d7}, [%[dst]]\n\t"
731 "vld1.8 {d31}, [%[mask]]\n\t"
732 "mov %[keep_dst], %[dst]\n\t"
734 "and ip, %[w], #7\n\t"
735 "add %[mask], %[mask], ip\n\t"
736 "add %[dst], %[dst], ip, LSL#2\n\t"
737 "subs %[w], %[w], ip\n\t"
741 "vld4.8 {d4-d7}, [%[dst]]!\n\t"
742 "vld1.8 {d31}, [%[mask]]!\n\t"
743 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
744 "sub %[keep_dst], %[dst], #8*4\n\t"
745 "subs %[w], %[w], #8\n\t"
748 "vmull.u8 q10, d31, d0\n\t"
749 "vmull.u8 q11, d31, d1\n\t"
750 "vmull.u8 q12, d31, d2\n\t"
751 "vmull.u8 q13, d31, d3\n\t"
752 "vrshr.u16 q8, q10, #8\n\t"
753 "vrshr.u16 q9, q11, #8\n\t"
754 "vraddhn.u16 d20, q10, q8\n\t"
755 "vraddhn.u16 d21, q11, q9\n\t"
756 "vrshr.u16 q9, q13, #8\n\t"
757 "vrshr.u16 q8, q12, #8\n\t"
758 "vraddhn.u16 d23, q13, q9\n\t"
759 "vraddhn.u16 d22, q12, q8\n\t"
761 "vmvn.8 d30, d23\n\t"
762 "vmull.u8 q12, d30, d4\n\t"
763 "vmull.u8 q13, d30, d5\n\t"
764 "vmull.u8 q14, d30, d6\n\t"
765 "vmull.u8 q15, d30, d7\n\t"
767 "vrshr.u16 q8, q12, #8\n\t"
768 "vrshr.u16 q9, q13, #8\n\t"
769 "vraddhn.u16 d4, q12, q8\n\t"
770 "vrshr.u16 q8, q14, #8\n\t"
771 "vraddhn.u16 d5, q13, q9\n\t"
772 "vrshr.u16 q9, q15, #8\n\t"
773 "vraddhn.u16 d6, q14, q8\n\t"
774 "vraddhn.u16 d7, q15, q9\n\t"
777 "vqadd.u8 d20, d4, d20\n\t"
778 "vqadd.u8 d21, d5, d21\n\t"
779 "vqadd.u8 d22, d6, d22\n\t"
780 "vqadd.u8 d23, d7, d23\n\t"
785 "vst4.8 {d20-d23}, [%[keep_dst]]\n\t"
787 : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst)
789 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
790 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
803 dst_line += dst_stride;
805 mask_line += mask_stride;
810 uint8x8_t dval, temp, res;
812 alpha = vtbl1_u8(vreinterpret_u8_u16(vld1_dup_u16((void*)mask)), mask_selector);
813 dval = vld1_u8((void*)dst);
815 temp = neon2mul(sval2,alpha);
816 res = vqadd_u8(temp,neon2mul(dval,vtbl1_u8(vmvn_u8(temp), alpha_selector)));
818 vst1_u8((void*)dst,res);
826 uint8x8_t dval, temp, res;
828 alpha = vtbl1_u8(vld1_dup_u8((void*)mask), mask_selector);
829 dval = vreinterpret_u8_u32(vld1_dup_u32((void*)dst));
831 temp = neon2mul(sval2,alpha);
832 res = vqadd_u8(temp,neon2mul(dval,vtbl1_u8(vmvn_u8(temp), alpha_selector)));
834 vst1_lane_u32((void*)dst,vreinterpret_u32_u8(res),0);
842 neon_composite_add_8888_8_8 (
843 pixman_implementation_t * impl,
845 pixman_image_t * src_image,
846 pixman_image_t * mask_image,
847 pixman_image_t * dst_image,
857 uint8_t *dst_line, *dst;
858 uint8_t *mask_line, *mask;
859 int dst_stride, mask_stride;
864 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
865 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
866 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
867 sa = vdup_n_u8((src) >> 24);
871 // Use overlapping 8-pixel method, modified to avoid re-reading destination pixels that have already been rewritten
875 dst_line += dst_stride;
877 mask_line += mask_stride;
880 uint8x8_t mval, dval, res;
883 mval = vld1_u8((void *)mask);
884 dval = vld1_u8((void *)dst);
887 res = vqadd_u8(neon2mul(mval,sa),dval);
895 mval = vld1_u8((void *)mask);
896 dval = vld1_u8((void *)dst);
897 vst1_u8((void *)keep_dst, res);
900 res = vqadd_u8(neon2mul(mval,sa),dval);
906 vst1_u8((void *)keep_dst, res);
911 // Use 4/2/1 load/store method to handle 1-7 pixels
915 dst_line += dst_stride;
917 mask_line += mask_stride;
920 uint8x8_t mval=sa, dval=sa, res;
921 uint8_t *dst4=0, *dst2=0;
925 mval = vreinterpret_u8_u32(vld1_lane_u32((void *)mask, vreinterpret_u32_u8(mval), 1));
926 dval = vreinterpret_u8_u32(vld1_lane_u32((void *)dst, vreinterpret_u32_u8(dval), 1));
934 mval = vreinterpret_u8_u16(vld1_lane_u16((void *)mask, vreinterpret_u16_u8(mval), 1));
935 dval = vreinterpret_u8_u16(vld1_lane_u16((void *)dst, vreinterpret_u16_u8(dval), 1));
942 mval = vld1_lane_u8(mask, mval, 1);
943 dval = vld1_lane_u8(dst, dval, 1);
946 res = vqadd_u8(neon2mul(mval,sa),dval);
949 vst1_lane_u8(dst, res, 1);
951 vst1_lane_u16((void *)dst2, vreinterpret_u16_u8(res), 1);
953 vst1_lane_u32((void *)dst4, vreinterpret_u32_u8(res), 1);
958 #ifdef USE_GCC_INLINE_ASM
961 neon_composite_src_16_16 (
962 pixman_implementation_t * impl,
964 pixman_image_t * src_image,
965 pixman_image_t * mask_image,
966 pixman_image_t * dst_image,
976 uint16_t *dst_line, *src_line;
977 uint32_t dst_stride, src_stride;
979 if(!height || !width)
982 /* We simply copy 16-bit-aligned pixels from one place to another. */
983 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
984 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
986 /* Preload the first input scanline */
988 uint16_t *src_ptr = src_line;
989 uint32_t count = width;
993 " subs %[count], %[count], #32 \n"
995 " add %[src], %[src], #64 \n"
998 // Clobbered input registers marked as input/outputs
999 : [src] "+r" (src_ptr), [count] "+r" (count)
1000 : // no unclobbered inputs
1006 uint16_t *dst_ptr = dst_line;
1007 uint16_t *src_ptr = src_line;
1008 uint32_t count = width;
1011 // Uses multi-register access and preloading to maximise bandwidth.
1012 // Each pixel is one halfword, so a quadword contains 8px.
1013 // Preload frequency assumes a 64-byte cache line.
1015 " cmp %[count], #64 \n"
1016 " blt 1f @ skip oversized fragments \n"
1017 "0: @ start with eight quadwords at a time \n"
1018 " pld [%[src], %[src_stride], LSL #1] \n" // preload from next scanline
1019 " sub %[count], %[count], #64 \n"
1020 " vld1.16 {d16,d17,d18,d19}, [%[src]]! \n"
1021 " vld1.16 {d20,d21,d22,d23}, [%[src]]! \n"
1022 " pld [%[src], %[src_stride], LSL #1] \n" // preload from next scanline
1023 " vld1.16 {d24,d25,d26,d27}, [%[src]]! \n"
1024 " vld1.16 {d28,d29,d30,d31}, [%[src]]! \n"
1025 " cmp %[count], #64 \n"
1026 " vst1.16 {d16,d17,d18,d19}, [%[dst]]! \n"
1027 " vst1.16 {d20,d21,d22,d23}, [%[dst]]! \n"
1028 " vst1.16 {d24,d25,d26,d27}, [%[dst]]! \n"
1029 " vst1.16 {d28,d29,d30,d31}, [%[dst]]! \n"
1031 " cmp %[count], #0 \n"
1032 " beq 7f @ aligned fastpath \n"
1033 "1: @ four quadwords \n"
1034 " tst %[count], #32 \n"
1035 " beq 2f @ skip oversized fragment \n"
1036 " pld [%[src], %[src_stride], LSL #1] \n" // preload from next scanline
1037 " vld1.16 {d16,d17,d18,d19}, [%[src]]! \n"
1038 " vld1.16 {d20,d21,d22,d23}, [%[src]]! \n"
1039 " vst1.16 {d16,d17,d18,d19}, [%[dst]]! \n"
1040 " vst1.16 {d20,d21,d22,d23}, [%[dst]]! \n"
1041 "2: @ two quadwords \n"
1042 " tst %[count], #16 \n"
1043 " beq 3f @ skip oversized fragment \n"
1044 " pld [%[src], %[src_stride], LSL #1] \n" // preload from next scanline
1045 " vld1.16 {d16,d17,d18,d19}, [%[src]]! \n"
1046 " vst1.16 {d16,d17,d18,d19}, [%[dst]]! \n"
1047 "3: @ one quadword \n"
1048 " tst %[count], #8 \n"
1049 " beq 4f @ skip oversized fragment \n"
1050 " vld1.16 {d16,d17}, [%[src]]! \n"
1051 " vst1.16 {d16,d17}, [%[dst]]! \n"
1052 "4: @ one doubleword \n"
1053 " tst %[count], #4 \n"
1054 " beq 5f @ skip oversized fragment \n"
1055 " vld1.16 {d16}, [%[src]]! \n"
1056 " vst1.16 {d16}, [%[dst]]! \n"
1058 " tst %[count], #2 \n"
1059 " beq 6f @ skip oversized fragment \n"
1060 " ldr %[tmp], [%[src]], #4 \n"
1061 " str %[tmp], [%[dst]], #4 \n"
1062 "6: @ one halfword \n"
1063 " tst %[count], #1 \n"
1064 " beq 7f @ skip oversized fragment \n"
1065 " ldrh %[tmp], [%[src]] \n"
1066 " strh %[tmp], [%[dst]] \n"
1069 // Clobbered input registers marked as input/outputs
1070 : [dst] "+r" (dst_ptr), [src] "+r" (src_ptr), [count] "+r" (count), [tmp] "+r" (tmp)
1072 // Unclobbered input
1073 : [src_stride] "r" (src_stride)
1075 // Clobbered vector registers
1076 // NB: these are the quad aliases of the double registers used in the asm
1077 : "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory"
1080 src_line += src_stride;
1081 dst_line += dst_stride;
1085 #endif /* USE_GCC_INLINE_ASM */
1088 neon_composite_src_24_16 (
1089 pixman_implementation_t * impl,
1091 pixman_image_t * src_image,
1092 pixman_image_t * mask_image,
1093 pixman_image_t * dst_image,
1105 uint32_t dst_stride, src_stride;
1107 if(!width || !height)
1110 /* We simply copy pixels from one place to another, assuming that the source's alpha is opaque. */
1111 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1112 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1114 /* Preload the first input scanline */
1116 uint8_t *src_ptr = (uint8_t*) src_line;
1117 uint32_t count = (width + 15) / 16;
1119 #ifdef USE_GCC_INLINE_ASM
1122 " subs %[count], %[count], #1 \n"
1124 " add %[src], %[src], #64 \n"
1127 // Clobbered input registers marked as input/outputs
1128 : [src] "+r" (src_ptr), [count] "+r" (count)
1129 : // no unclobbered inputs
1141 uint16_t *dst_ptr = dst_line;
1142 uint32_t *src_ptr = src_line;
1143 uint32_t count = width;
1144 const uint32_t rb_mask = 0x1F;
1145 const uint32_t g_mask = 0x3F;
1147 // If you're going to complain about a goto, take a long hard look
1148 // at the massive blocks of assembler this skips over. ;-)
1152 #ifdef USE_GCC_INLINE_ASM
1154 // This is not as aggressive as the RGB565-source case.
1155 // Generally the source is in cached RAM when the formats are different, so we use preload.
1156 // We don't need to blend, so we are not reading from the uncached framebuffer.
1158 " cmp %[count], #16 \n"
1159 " blt 1f @ skip oversized fragments \n"
1160 "0: @ start with sixteen pixels at a time \n"
1161 " sub %[count], %[count], #16 \n"
1162 " pld [%[src], %[src_stride], lsl #2] @ preload from next scanline \n"
1163 " vld4.8 {d0,d1,d2,d3}, [%[src]]! @ d3 is alpha and ignored, d2-0 are rgb. \n"
1164 " vld4.8 {d4,d5,d6,d7}, [%[src]]! @ d7 is alpha and ignored, d6-4 are rgb. \n"
1165 " vshll.u8 q8, d2, #8 @ expand first red for repacking \n"
1166 " vshll.u8 q10, d1, #8 @ expand first green for repacking \n"
1167 " vshll.u8 q11, d0, #8 @ expand first blue for repacking \n"
1168 " vshll.u8 q9, d6, #8 @ expand second red for repacking \n"
1169 " vsri.u16 q8, q10, #5 @ insert first green after red \n"
1170 " vshll.u8 q10, d5, #8 @ expand second green for repacking \n"
1171 " vsri.u16 q8, q11, #11 @ insert first blue after green \n"
1172 " vshll.u8 q11, d4, #8 @ expand second blue for repacking \n"
1173 " vsri.u16 q9, q10, #5 @ insert second green after red \n"
1174 " vsri.u16 q9, q11, #11 @ insert second blue after green \n"
1175 " cmp %[count], #16 \n"
1176 " vst1.16 {d16,d17,d18,d19}, [%[dst]]! @ store 16 pixels \n"
1178 "1: @ end of main loop \n"
1179 " cmp %[count], #8 @ can we still do an 8-pixel block? \n"
1181 " sub %[count], %[count], #8 \n"
1182 " pld [%[src], %[src_stride], lsl #2] @ preload from next scanline \n"
1183 " vld4.8 {d0,d1,d2,d3}, [%[src]]! @ d3 is alpha and ignored, d2-0 are rgb. \n"
1184 " vshll.u8 q8, d2, #8 @ expand first red for repacking \n"
1185 " vshll.u8 q10, d1, #8 @ expand first green for repacking \n"
1186 " vshll.u8 q11, d0, #8 @ expand first blue for repacking \n"
1187 " vsri.u16 q8, q10, #5 @ insert first green after red \n"
1188 " vsri.u16 q8, q11, #11 @ insert first blue after green \n"
1189 " vst1.16 {d16,d17}, [%[dst]]! @ store 8 pixels \n"
1192 // Clobbered input and working registers marked as input/outputs
1193 : [dst] "+r" (dst_ptr), [src] "+r" (src_ptr), [count] "+r" (count)
1195 // Unclobbered input
1196 : [src_stride] "r" (src_stride)
1198 // Clobbered vector registers
1199 // NB: these are the quad aliases of the double registers used in the asm
1200 : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "cc", "memory"
1203 // A copy of the above code, in intrinsics-form.
1204 // This should be pretty self-documenting...
1205 while(count >= 16) {
1206 uint8x8x4_t pixel_set_a, pixel_set_b;
1207 uint16x8_t red_a, green_a, blue_a;
1208 uint16x8_t red_b, green_b, blue_b;
1209 uint16x8_t dest_pixels_a, dest_pixels_b;
1212 __pld(src_ptr + src_stride);
1213 pixel_set_a = vld4_u8((uint8_t*)(src_ptr));
1214 pixel_set_b = vld4_u8((uint8_t*)(src_ptr+8));
1217 red_a = vshll_n_u8(pixel_set_a.val[2], 8);
1218 green_a = vshll_n_u8(pixel_set_a.val[1], 8);
1219 blue_a = vshll_n_u8(pixel_set_a.val[0], 8);
1220 red_b = vshll_n_u8(pixel_set_b.val[2], 8);
1221 green_b = vshll_n_u8(pixel_set_b.val[1], 8);
1222 blue_b = vshll_n_u8(pixel_set_b.val[0], 8);
1223 dest_pixels_a = vsriq_n_u16(red_a, green_a, 5);
1224 dest_pixels_b = vsriq_n_u16(red_b, green_b, 5);
1225 dest_pixels_a = vsriq_n_u16(dest_pixels_a, blue_a, 11);
1226 dest_pixels_b = vsriq_n_u16(dest_pixels_b, blue_b, 11);
1228 // There doesn't seem to be an intrinsic for the double-quadword variant
1229 vst1q_u16(dst_ptr , dest_pixels_a);
1230 vst1q_u16(dst_ptr+8, dest_pixels_b);
1236 uint8x8x4_t pixel_set_a;
1237 uint16x8_t red_a, green_a, blue_a;
1238 uint16x8_t dest_pixels_a;
1240 __pld(src_ptr + src_stride);
1242 pixel_set_a = vld4_u8((uint8_t*)(src_ptr));
1245 red_a = vshll_n_u8(pixel_set_a.val[2], 8);
1246 green_a = vshll_n_u8(pixel_set_a.val[1], 8);
1247 blue_a = vshll_n_u8(pixel_set_a.val[0], 8);
1248 dest_pixels_a = vsriq_n_u16(red_a, green_a, 5);
1249 dest_pixels_a = vsriq_n_u16(dest_pixels_a, blue_a, 11);
1251 vst1q_u16(dst_ptr , dest_pixels_a);
1255 #endif // USE_GCC_INLINE_ASM
1260 __pld(src_ptr + src_stride);
1263 uint32_t src_pixel_a = *src_ptr++;
1264 uint32_t src_pixel_b = *src_ptr++;
1266 // ARM is really good at shift-then-ALU ops.
1267 // This should be a total of six shift-ANDs and five shift-ORs.
1268 uint32_t dst_pixels_a;
1269 uint32_t dst_pixels_b;
1271 dst_pixels_a = ((src_pixel_a >> 3) & rb_mask);
1272 dst_pixels_a |= ((src_pixel_a >> 10) & g_mask) << 5;
1273 dst_pixels_a |= ((src_pixel_a >> 19) & rb_mask) << 11;
1275 dst_pixels_b = ((src_pixel_b >> 3) & rb_mask);
1276 dst_pixels_b |= ((src_pixel_b >> 10) & g_mask) << 5;
1277 dst_pixels_b |= ((src_pixel_b >> 19) & rb_mask) << 11;
1279 // little-endian mode only
1280 *((uint32_t*) dst_ptr) = dst_pixels_a | (dst_pixels_b << 16);
1286 uint32_t src_pixel = *src_ptr++;
1288 // ARM is really good at shift-then-ALU ops.
1289 // This block should end up as three shift-ANDs and two shift-ORs.
1290 uint32_t tmp_blue = (src_pixel >> 3) & rb_mask;
1291 uint32_t tmp_green = (src_pixel >> 10) & g_mask;
1292 uint32_t tmp_red = (src_pixel >> 19) & rb_mask;
1293 uint16_t dst_pixel = (tmp_red << 11) | (tmp_green << 5) | tmp_blue;
1295 *dst_ptr++ = dst_pixel;
1299 src_line += src_stride;
1300 dst_line += dst_stride;
1305 static pixman_bool_t
1306 pixman_fill_neon (uint32_t *bits,
1315 uint32_t byte_stride, color;
1318 /* stride is always a multiple of 32-bit units in pixman */
1319 byte_stride = stride * sizeof(uint32_t);
1324 dst = ((char *) bits) + y * byte_stride + x;
1326 color = _xor << 24 | _xor << 16 | _xor << 8 | _xor;
1329 dst = ((char *) bits) + y * byte_stride + x * 2;
1331 color = _xor << 16 | _xor;
1332 width *= 2; /* width to bytes */
1335 dst = ((char *) bits) + y * byte_stride + x * 4;
1337 width *= 4; /* width to bytes */
1343 #ifdef USE_GCC_INLINE_ASM
1345 /* We have a special case for widths so small that wide 128-bit
1346 stores could not be used anyway. We don't waste time trying to
1347 align the writes, since there are only a few of them per scanline */
1349 "cmp %[height], #0\n" /* Check if empty fill */
1351 "vdup.32 d0, %[color]\n" /* Duplicate the colour into a NEON register */
1353 /* Check whether the width can be handled with a single store
1354 operation per scanline. This significantly reduces the number
1355 of test/branch instructions executed for each scanline */
1356 "cmp %[width], #8\n"
1358 "cmp %[width], #4\n"
1360 "cmp %[width], #2\n"
1363 /* Loop starts here for each scanline */
1365 "mov r4, %[dst]\n" /* Starting address of the current line */
1366 "tst %[width], #8\n"
1368 "vst1.8 {d0}, [r4]!\n"
1370 "tst %[width], #4\n"
1372 "str %[color], [r4], #4\n"
1374 "tst %[width], #2\n"
1376 "strh %[color], [r4], #2\n"
1378 "tst %[width], #1\n"
1380 "strb %[color], [r4], #1\n"
1383 "subs %[height], %[height], #1\n"
1384 "add %[dst], %[dst], %[byte_stride]\n"
1388 /* Special fillers for those widths that we can do with single operation */
1390 "subs %[height], %[height], #1\n"
1391 "vst1.8 {d0}, [%[dst]]\n"
1392 "add %[dst], %[dst], %[byte_stride]\n"
1397 "subs %[height], %[height], #1\n"
1398 "str %[color], [%[dst]]\n"
1399 "add %[dst], %[dst], %[byte_stride]\n"
1404 "subs %[height], %[height], #1\n"
1405 "strh %[color], [%[dst]]\n"
1406 "add %[dst], %[dst], %[byte_stride]\n"
1410 : /* No output members */
1411 : [color] "r" (color), [height] "r" (height), [width] "r" (width),
1412 [dst] "r" (dst) , [byte_stride] "r" (byte_stride)
1413 : "memory", "cc", "d0", "r4", "r5");
1416 "cmp %[height], #0\n" /* Check if empty fill */
1418 "vdup.32 q0, %[color]\n" /* Duplicate the colour into a NEON register */
1420 /* Loop starts here for each scanline */
1422 "mov r4, %[dst]\n" /* Starting address of the current line */
1423 "mov r5, %[width]\n" /* We're going to write this many bytes */
1424 "ands r6, r4, #15\n" /* Are we at the 128-bit aligned address? */
1425 "beq 2f\n" /* Jump to the best case */
1427 /* We're not 128-bit aligned: However, we know that we can get to the
1428 next aligned location, since the fill is at least 16 bytes wide */
1429 "rsb r6, r6, #16\n" /* We would need to go forward this much */
1430 "sub r5, r5, r6\n" /* Update bytes left */
1433 "vst1.8 {d0[0]}, [r4]!\n"/* Store byte, now we are 16-bit aligned */
1437 "vst1.16 {d0[0]}, [r4, :16]!\n"/* Store halfword, now we are 32-bit aligned */
1441 "vst1.32 {d0[0]}, [r4, :32]!\n"/* Store word, now we're 64-bit aligned */
1445 "vst1.64 {d0}, [r4, :64]!\n" /* Store doubleword, now we're 128-bit aligned */
1447 /* The good case: We're 128-bit aligned for this scanline */
1449 "and r6, r5, #15\n" /* Number of tailing bytes */
1450 "cmp r5, r6\n" /* Do we have at least one qword to write? */
1451 "beq 6f\n" /* No, we just write the tail */
1452 "lsr r5, r5, #4\n" /* This many full qwords to write */
1454 /* The main block: Do 128-bit aligned writes */
1457 "vst1.64 {d0,d1}, [r4, :128]!\n"
1460 /* Handle the trailing bytes: do 64, 32, 16 and 8-bit writes as needed.
1461 We know that we're currently at a 128-bit aligned address, so we can just
1462 pick the biggest operation that the remaining write width allows */
1468 "vst1.64 {d0}, [r4, :64]!\n"
1472 "vst1.32 {d0[0]}, [r4, :32]!\n"
1476 "vst1.16 {d0[0]}, [r4, :16]!\n"
1480 "vst1.8 {d0[0]}, [r4]!\n"
1483 /* Handle the next scanline */
1484 "subs %[height], %[height], #1\n"
1485 "add %[dst], %[dst], %[byte_stride]\n"
1488 : /* No output members */
1489 : [color] "r" (color), [height] "r" (height), [width] "r" (width),
1490 [dst] "r" (dst) , [byte_stride] "r" (byte_stride)
1491 : "memory", "cc", "q0", "d0", "d1", "r4", "r5", "r6");
1497 // TODO: intrinsic version for armcc
1504 // TODO: is a more generic way of doing this being introduced?
1505 #define NEON_SCANLINE_BUFFER_PIXELS (1024)
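// Wider blits are split into chunks of at most this many pixels (see the
// over_*_0565 routines below), keeping the fixed-size scanline staging
// buffers small enough to stay cache-resident.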
1507 static inline void neon_quadword_copy(
1510 uint32_t count, // of quadwords
1511 uint32_t trailer_count // of bytes
1514 uint8_t *t_dst = dst, *t_src = src;
1516 // Uses aligned multi-register loads to maximise read bandwidth
1517 // on uncached memory such as framebuffers
1518 // The accesses do not have the aligned qualifiers, so that the copy
1519 // may convert between aligned-uncached and unaligned-cached memory.
1520 // It is assumed that the CPU can infer alignment from the address.
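// Usage sketch (illustrative): to copy n bytes,
//     neon_quadword_copy(dst, src, n / 16, n % 16);
// which is how the callers below split their byte widths.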
1522 #ifdef USE_GCC_INLINE_ASM
1525 " cmp %[count], #8 \n"
1526 " blt 1f @ skip oversized fragments \n"
1527 "0: @ start with eight quadwords at a time \n"
1528 " sub %[count], %[count], #8 \n"
1529 " vld1.8 {d16,d17,d18,d19}, [%[src]]! \n"
1530 " vld1.8 {d20,d21,d22,d23}, [%[src]]! \n"
1531 " vld1.8 {d24,d25,d26,d27}, [%[src]]! \n"
1532 " vld1.8 {d28,d29,d30,d31}, [%[src]]! \n"
1533 " cmp %[count], #8 \n"
1534 " vst1.8 {d16,d17,d18,d19}, [%[dst]]! \n"
1535 " vst1.8 {d20,d21,d22,d23}, [%[dst]]! \n"
1536 " vst1.8 {d24,d25,d26,d27}, [%[dst]]! \n"
1537 " vst1.8 {d28,d29,d30,d31}, [%[dst]]! \n"
1539 "1: @ four quadwords \n"
1540 " tst %[count], #4 \n"
1541 " beq 2f @ skip oversized fragment \n"
1542 " vld1.8 {d16,d17,d18,d19}, [%[src]]! \n"
1543 " vld1.8 {d20,d21,d22,d23}, [%[src]]! \n"
1544 " vst1.8 {d16,d17,d18,d19}, [%[dst]]! \n"
1545 " vst1.8 {d20,d21,d22,d23}, [%[dst]]! \n"
1546 "2: @ two quadwords \n"
1547 " tst %[count], #2 \n"
1548 " beq 3f @ skip oversized fragment \n"
1549 " vld1.8 {d16,d17,d18,d19}, [%[src]]! \n"
1550 " vst1.8 {d16,d17,d18,d19}, [%[dst]]! \n"
1551 "3: @ one quadword \n"
1552 " tst %[count], #1 \n"
1553 " beq 4f @ skip oversized fragment \n"
1554 " vld1.8 {d16,d17}, [%[src]]! \n"
1555 " vst1.8 {d16,d17}, [%[dst]]! \n"
1558 // Clobbered input registers marked as input/outputs
1559 : [dst] "+r" (t_dst), [src] "+r" (t_src), [count] "+r" (count)
1561 // No unclobbered inputs
1564 // Clobbered vector registers
1565 // NB: these are the quad aliases of the double registers used in the asm
1566 : "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory"
1572 uint8x16x4_t t1 = vld4q_u8(t_src);
1573 uint8x16x4_t t2 = vld4q_u8(t_src + sizeof(uint8x16x4_t));
1574 t_src += sizeof(uint8x16x4_t) * 2;
1575 vst4q_u8(t_dst, t1);
1576 vst4q_u8(t_dst + sizeof(uint8x16x4_t), t2);
1577 t_dst += sizeof(uint8x16x4_t) * 2;
1582 uint8x16x4_t t1 = vld4q_u8(t_src);
1583 t_src += sizeof(uint8x16x4_t);
1584 vst4q_u8(t_dst, t1);
1585 t_dst += sizeof(uint8x16x4_t);
1589 uint8x8x4_t t1 = vld4_u8(t_src);
1590 t_src += sizeof(uint8x8x4_t);
1592 t_dst += sizeof(uint8x8x4_t);
1596 uint8x16_t t1 = vld1q_u8(t_src);
1597 t_src += sizeof(uint8x16_t);
1598 vst1q_u8(t_dst, t1);
1599 t_dst += sizeof(uint8x16_t);
1602 #endif // !USE_GCC_INLINE_ASM
1605 if(trailer_count & 8) {
1606 uint8x8_t t1 = vld1_u8(t_src);
1607 t_src += sizeof(uint8x8_t);
1609 t_dst += sizeof(uint8x8_t);
1612 if(trailer_count & 4) {
1613 *((uint32_t*) t_dst) = *((uint32_t*) t_src);
1618 if(trailer_count & 2) {
1619 *((uint16_t*) t_dst) = *((uint16_t*) t_src);
1624 if(trailer_count & 1) {
1625 *t_dst++ = *t_src++;
1630 static inline void solid_over_565_8_pix_neon(
1631 uint32_t glyph_colour,
1634 uint32_t dest_stride, // bytes, not elements
1635 uint32_t mask_stride,
1636 uint32_t count // 8-pixel groups
1639 // Inner loop of glyph blitter (solid colour, alpha mask)
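// Roughly, per channel of each 8-pixel group, with m = (mask * colour_alpha) >> 8:
//     dest = colour_channel * m + dest_channel * (255 - m)
// where the 565 destination is expanded to 8 bits per channel on load and
// repacked on store using the same bit-replication trick as unpack0565.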
1641 #ifdef USE_GCC_INLINE_ASM
1644 " vld4.8 {d20[],d21[],d22[],d23[]}, [%[glyph_colour]] @ splat solid colour components \n"
1646 " vld1.16 {d0,d1}, [%[dest]] @ load first pixels from framebuffer \n"
1647 " vld1.8 {d17}, [%[in_mask]] @ load alpha mask of glyph \n"
1648 " vmull.u8 q9, d17, d23 @ apply glyph colour alpha to mask \n"
1649 " vshrn.u16 d17, q9, #8 @ reformat it to match original mask \n"
1650 " vmvn d18, d17 @ we need the inverse mask for the background \n"
1651 " vsli.u16 q3, q0, #5 @ duplicate framebuffer blue bits \n"
1652 " vshrn.u16 d2, q0, #8 @ unpack red from framebuffer pixels \n"
1653 " vshrn.u16 d4, q0, #3 @ unpack green \n"
1654 " vsri.u8 d2, d2, #5 @ duplicate red bits (extend 5 to 8) \n"
1655 " vshrn.u16 d6, q3, #2 @ unpack extended blue (truncate 10 to 8) \n"
1656 " vsri.u8 d4, d4, #6 @ duplicate green bits (extend 6 to 8) \n"
1657 " vmull.u8 q1, d2, d18 @ apply inverse mask to background red... \n"
1658 " vmull.u8 q2, d4, d18 @ ...green... \n"
1659 " vmull.u8 q3, d6, d18 @ ...blue \n"
1660 " subs %[count], %[count], #1 @ decrement/test loop counter \n"
1661 " vmlal.u8 q1, d17, d22 @ add masked foreground red... \n"
1662 " vmlal.u8 q2, d17, d21 @ ...green... \n"
1663 " vmlal.u8 q3, d17, d20 @ ...blue \n"
1664 " add %[in_mask], %[in_mask], %[mask_stride] @ advance mask pointer, while we wait \n"
1665 " vsri.16 q1, q2, #5 @ pack green behind red \n"
1666 " vsri.16 q1, q3, #11 @ pack blue into pixels \n"
1667 " vst1.16 {d2,d3}, [%[dest]] @ store composited pixels \n"
1668 " add %[dest], %[dest], %[dest_stride] @ advance framebuffer pointer \n"
1669 " bne 0b @ next please \n"
1671 // Clobbered registers marked as input/outputs
1672 : [dest] "+r" (dest), [in_mask] "+r" (in_mask), [count] "+r" (count)
1675 : [dest_stride] "r" (dest_stride), [mask_stride] "r" (mask_stride), [glyph_colour] "r" (&glyph_colour)
1677 // Clobbers, including the inputs we modify, and potentially lots of memory
1678 : "q0", "q1", "q2", "q3", "d17", "q9", "q10", "q11", "q12", "cc", "memory"
1683 uint8x8x4_t solid_colour = vld4_dup_u8((uint8_t*) &glyph_colour);
1687 uint16x8_t pixels = vld1q_u16(dest);
1688 uint8x8_t mask = vshrn_n_u16(vmull_u8(solid_colour.val[3], vld1_u8(in_mask)), 8);
1689 uint8x8_t mask_image = vmvn_u8(mask);
1691 uint8x8_t t_red = vshrn_n_u16(pixels, 8);
1692 uint8x8_t t_green = vshrn_n_u16(pixels, 3);
1693 uint8x8_t t_blue = vshrn_n_u16(vsliq_n_u16(pixels, pixels, 5), 2);
1695 uint16x8_t s_red = vmull_u8(vsri_n_u8(t_red , t_red , 5), mask_image);
1696 uint16x8_t s_green = vmull_u8(vsri_n_u8(t_green, t_green, 6), mask_image);
1697 uint16x8_t s_blue = vmull_u8( t_blue , mask_image);
1699 s_red = vmlal_u8(s_red , mask, solid_colour.val[2]);
1700 s_green = vmlal_u8(s_green, mask, solid_colour.val[1]);
1701 s_blue = vmlal_u8(s_blue , mask, solid_colour.val[0]);
1703 pixels = vsriq_n_u16(s_red, s_green, 5);
1704 pixels = vsriq_n_u16(pixels, s_blue, 11);
1705 vst1q_u16(dest, pixels);
1707 dest = (uint16_t*) ((uint8_t*) dest + dest_stride); /* dest_stride is in bytes */
1708 in_mask += mask_stride;
1715 neon_composite_over_n_8_0565 (
1716 pixman_implementation_t * impl,
1718 pixman_image_t * src_image,
1719 pixman_image_t * mask_image,
1720 pixman_image_t * dst_image,
1731 uint16_t *dst_line, *aligned_line;
1733 uint32_t dst_stride, mask_stride;
1734 uint32_t kernel_count, copy_count, copy_tail;
1735 uint8_t kernel_offset, copy_offset;
1737 src = _pixman_image_get_solid(src_image, dst_image->bits.format);
1739 // bail out if fully transparent or degenerate
1743 if(width == 0 || height == 0)
1746 if(width > NEON_SCANLINE_BUFFER_PIXELS) {
1747 // split the blit, so we can use a fixed-size scanline buffer
1748 // TODO: there must be a more elegant way of doing this.
1750 for(x=0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS) {
1751 neon_composite_over_n_8_0565(impl, op, src_image, mask_image, dst_image, src_x+x, src_y, mask_x+x, mask_y, dest_x+x, dest_y,
1752 (x+NEON_SCANLINE_BUFFER_PIXELS > width) ? width-x : NEON_SCANLINE_BUFFER_PIXELS, height);
1757 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1758 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1760 // cover the width with the minimum number of aligned quadwords
1761 // while also keeping the number of columns to process to a minimum
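// Below: aligned_left/aligned_right bound the scanline's quadword-aligned
// footprint, ceiling_length is the scanline's own byte length rounded up to
// a whole quadword, copy_offset is how many pixels the aligned start
// precedes the true left edge by, and copy_count/kernel_count are the
// quadwords handled by the staging copy and the compositing kernel.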
1763 unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
1764 unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
1765 unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
1767 // the fast copy should be quadword aligned
1768 copy_offset = dst_line - ((uint16_t*) aligned_left);
1769 aligned_line = dst_line - copy_offset;
1770 copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
1773 if(aligned_right - aligned_left > ceiling_length) {
1774 // unaligned routine is tightest
1775 kernel_count = (uint32_t) (ceiling_length >> 4);
1776 kernel_offset = copy_offset;
1778 // aligned routine is equally tight, so it is safer to align
1779 kernel_count = copy_count;
1783 // We should avoid reading beyond scanline ends for safety
1784 if(aligned_line < (dst_line - dest_x) ||
1785 (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
1787 // switch to precise read
1788 copy_offset = kernel_offset = 0;
1789 aligned_line = dst_line;
1790 kernel_count = (uint32_t) (ceiling_length >> 4);
1791 copy_count = (width * sizeof(*dst_line)) >> 4;
1792 copy_tail = (width * sizeof(*dst_line)) & 0xF;
1797 uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8]; // deliberately not initialised
1798 uint8_t glyph_line[NEON_SCANLINE_BUFFER_PIXELS + 8];
1802 // left edge, middle block, right edge
1803 for( ; y--; mask_line += mask_stride, aligned_line += dst_stride, dst_line += dst_stride) {
1804 // We don't want to overrun the edges of the glyph, so realign the edge data into known buffers
1805 neon_quadword_copy(glyph_line + copy_offset, mask_line, width >> 4, width & 0xF);
1807 // Uncached framebuffer access is really, really slow if we do it piecemeal.
1808 // It should be much faster if we grab it all at once.
1809 // One scanline should easily fit in L1 cache, so this should not waste RAM bandwidth.
1810 neon_quadword_copy(scan_line, aligned_line, copy_count, copy_tail);
1812 // Apply the actual filter
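// Within the staging buffers successive 8-pixel groups are contiguous, so
// the "strides" passed here are simply 16 bytes of pixels and 8 mask bytes.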
1813 solid_over_565_8_pix_neon(src, scan_line + kernel_offset, glyph_line + kernel_offset, 8 * sizeof(*dst_line), 8, kernel_count);
1815 // Copy the modified scanline back
1816 neon_quadword_copy(dst_line, scan_line + copy_offset, width >> 3, (width & 7) * 2);
1821 #ifdef USE_GCC_INLINE_ASM
1823 static inline void plain_over_565_8_pix_neon(
1826 uint32_t dest_stride, // bytes, not elements
1827 uint32_t count // 8-pixel groups
1830 // Inner loop for plain translucent rects (solid colour without alpha mask)
1832 " vld4.8 {d20[],d21[],d22[],d23[]}, [%[colour]] @ solid colour load/splat \n"
1833 " vmull.u8 q12, d23, d22 @ premultiply alpha red \n"
1834 " vmull.u8 q13, d23, d21 @ premultiply alpha green \n"
1835 " vmull.u8 q14, d23, d20 @ premultiply alpha blue \n"
1836 " vmvn d18, d23 @ inverse alpha for background \n"
1838 " vld1.16 {d0,d1}, [%[dest]] @ load first pixels from framebuffer \n"
1839 " vshrn.u16 d2, q0, #8 @ unpack red from framebuffer pixels \n"
1840 " vshrn.u16 d4, q0, #3 @ unpack green \n"
1841 " vsli.u16 q3, q0, #5 @ duplicate framebuffer blue bits \n"
1842 " vsri.u8 d2, d2, #5 @ duplicate red bits (extend 5 to 8) \n"
1843 " vsri.u8 d4, d4, #6 @ duplicate green bits (extend 6 to 8) \n"
1844 " vshrn.u16 d6, q3, #2 @ unpack extended blue (truncate 10 to 8) \n"
1845 " vmov q0, q12 @ retrieve foreground red \n"
1846 " vmlal.u8 q0, d2, d18 @ blend red - my kingdom for a four-operand MLA \n"
1847 " vmov q1, q13 @ retrieve foreground green \n"
1848 " vmlal.u8 q1, d4, d18 @ blend green \n"
1849 " vmov q2, q14 @ retrieve foreground blue \n"
1850 " vmlal.u8 q2, d6, d18 @ blend blue \n"
1851 " subs %[count], %[count], #1 @ decrement/test loop counter \n"
1852 " vsri.16 q0, q1, #5 @ pack green behind red \n"
1853 " vsri.16 q0, q2, #11 @ pack blue into pixels \n"
1854 " vst1.16 {d0,d1}, [%[dest]] @ store composited pixels \n"
1855 " add %[dest], %[dest], %[dest_stride] @ advance framebuffer pointer \n"
1856 " bne 0b @ next please \n"
1858 // Clobbered registers marked as input/outputs
1859 : [dest] "+r" (dest), [count] "+r" (count)
1862 : [dest_stride] "r" (dest_stride), [colour] "r" (&colour)
1864 // Clobbers, including the inputs we modify, and potentially lots of memory
1865 : "q0", "q1", "q2", "q3", "q9", "q10", "q11", "q12", "q13", "q14", "cc", "memory"
1870 neon_composite_over_n_0565 (
1871 pixman_implementation_t * impl,
1873 pixman_image_t * src_image,
1874 pixman_image_t * mask_image,
1875 pixman_image_t * dst_image,
1886 uint16_t *dst_line, *aligned_line;
1887 uint32_t dst_stride;
1888 uint32_t kernel_count, copy_count, copy_tail;
1889 uint8_t kernel_offset, copy_offset;
1891 src = _pixman_image_get_solid(src_image, dst_image->bits.format);
1893 // bail out if fully transparent
1897 if(width == 0 || height == 0)
1900 if(width > NEON_SCANLINE_BUFFER_PIXELS) {
1901 // split the blit, so we can use a fixed-size scanline buffer
1902 // TODO: there must be a more elegant way of doing this.
1904 for(x=0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS) {
1905 neon_composite_over_n_0565(impl, op, src_image, mask_image, dst_image, src_x+x, src_y, mask_x+x, mask_y, dest_x+x, dest_y,
1906 (x+NEON_SCANLINE_BUFFER_PIXELS > width) ? width-x : NEON_SCANLINE_BUFFER_PIXELS, height);
1911 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1913 // cover the width with the minimum number of aligned quadwords
1914 // while also keeping the number of columns to process to a minimum
1916 unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
1917 unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
1918 unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
1920 // the fast copy should be quadword aligned
1921 copy_offset = dst_line - ((uint16_t*) aligned_left);
1922 aligned_line = dst_line - copy_offset;
1923 copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
1926 if(aligned_right - aligned_left > ceiling_length) {
1927 // unaligned routine is tightest
1928 kernel_count = (uint32_t) (ceiling_length >> 4);
1929 kernel_offset = copy_offset;
1931 // aligned routine is equally tight, so it is safer to align
1932 kernel_count = copy_count;
1936 // We should avoid reading beyond scanline ends for safety
1937 if(aligned_line < (dst_line - dest_x) ||
1938 (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
1940 // switch to precise read
1941 copy_offset = kernel_offset = 0;
1942 aligned_line = dst_line;
1943 kernel_count = (uint32_t) (ceiling_length >> 4);
1944 copy_count = (width * sizeof(*dst_line)) >> 4;
1945 copy_tail = (width * sizeof(*dst_line)) & 0xF;
1950 uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8]; // deliberately not initialised
1953 // left edge, middle block, right edge
1954 for( ; height--; aligned_line += dst_stride, dst_line += dst_stride) {
1956 // Uncached framebuffer access is really, really slow if we do it piecemeal.
1957 // It should be much faster if we grab it all at once.
1958 // One scanline should easily fit in L1 cache, so this should not waste RAM bandwidth.
1959 neon_quadword_copy(scan_line, aligned_line, copy_count, copy_tail);
1961 // Apply the actual filter
1962 plain_over_565_8_pix_neon(src, scan_line + kernel_offset, 8 * sizeof(*dst_line), kernel_count);
1964 // Copy the modified scanline back
1965 neon_quadword_copy(dst_line, scan_line + copy_offset, width >> 3, (width & 7) * 2);
1970 static inline void ARGB8_over_565_8_pix_neon(
1973 uint32_t src_stride, // bytes, not elements
1974 uint32_t count // 8-pixel groups
1979 " pld [%[src], %[src_stride]] @ preload from next scanline \n"
1980 " vld1.16 {d0,d1}, [%[dest]] @ load pixels from framebuffer \n"
1981 " vld4.8 {d20,d21,d22,d23},[%[src]]! @ load source image pixels \n"
1982 " vsli.u16 q3, q0, #5 @ duplicate framebuffer blue bits \n"
1983 " vshrn.u16 d2, q0, #8 @ unpack red from framebuffer pixels \n"
1984 " vshrn.u16 d4, q0, #3 @ unpack green \n"
1985 " vmvn d18, d23 @ we need the inverse alpha for the background \n"
1986 " vsri.u8 d2, d2, #5 @ duplicate red bits (extend 5 to 8) \n"
1987 " vshrn.u16 d6, q3, #2 @ unpack extended blue (truncate 10 to 8) \n"
1988 " vsri.u8 d4, d4, #6 @ duplicate green bits (extend 6 to 8) \n"
1989 " vmull.u8 q1, d2, d18 @ apply inverse alpha to background red... \n"
1990 " vmull.u8 q2, d4, d18 @ ...green... \n"
1991 " vmull.u8 q3, d6, d18 @ ...blue \n"
1992 " subs %[count], %[count], #1 @ decrement/test loop counter \n"
1993 " vmlal.u8 q1, d23, d22 @ add blended foreground red... \n"
1994 " vmlal.u8 q2, d23, d21 @ ...green... \n"
1995 " vmlal.u8 q3, d23, d20 @ ...blue \n"
1996 " vsri.16 q1, q2, #5 @ pack green behind red \n"
1997 " vsri.16 q1, q3, #11 @ pack blue into pixels \n"
1998 " vst1.16 {d2,d3}, [%[dest]]! @ store composited pixels \n"
1999 " bne 0b @ next please \n"
2001 // Clobbered registers marked as input/outputs
2002 : [dest] "+r" (dest), [src] "+r" (src), [count] "+r" (count)
2005 : [src_stride] "r" (src_stride)
2007 // Clobbers, including the inputs we modify, and potentially lots of memory
2008 : "q0", "q1", "q2", "q3", "d17", "d18", "q10", "q11", "cc", "memory"
2013 neon_composite_over_8888_0565 (
2014 pixman_implementation_t * impl,
2016 pixman_image_t * src_image,
2017 pixman_image_t * mask_image,
2018 pixman_image_t * dst_image,
2029 uint16_t *dst_line, *aligned_line;
2030 uint32_t dst_stride, src_stride;
2031 uint32_t kernel_count, copy_count, copy_tail;
2032 uint8_t kernel_offset, copy_offset;
2034 // we assume mask is opaque
2035 // so the only alpha to deal with is embedded in src
2037 if(width > NEON_SCANLINE_BUFFER_PIXELS) {
2038 // split the blit, so we can use a fixed-size scanline buffer
2040 for(x=0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS) {
2041 neon_composite_over_8888_0565(impl, op, src_image, mask_image, dst_image, src_x+x, src_y, mask_x+x, mask_y, dest_x+x, dest_y,
2042 (x+NEON_SCANLINE_BUFFER_PIXELS > width) ? width-x : NEON_SCANLINE_BUFFER_PIXELS, height);
2047 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2048 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2050 // cover the width with the minimum number of aligned quadwords
2051 // while also keeping the number of columns to process to a minimum
2053 unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
2054 unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
2055 unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
2057 // the fast copy should be quadword aligned
2058 copy_offset = dst_line - ((uint16_t*) aligned_left);
2059 aligned_line = dst_line - copy_offset;
2060 copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
2063 if(aligned_right - aligned_left > ceiling_length) {
2064 // unaligned routine is tightest
2065 kernel_count = (uint32_t) (ceiling_length >> 4);
2066 kernel_offset = copy_offset;
2068 // aligned routine is equally tight, so it is safer to align
2069 kernel_count = copy_count;
2073 // We should avoid reading beyond scanline ends for safety
2074 if(aligned_line < (dst_line - dest_x) ||
2075 (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
2077 // switch to precise read
2078 copy_offset = kernel_offset = 0;
2079 aligned_line = dst_line;
2080 kernel_count = (uint32_t) (ceiling_length >> 4);
2081 copy_count = (width * sizeof(*dst_line)) >> 4;
2082 copy_tail = (width * sizeof(*dst_line)) & 0xF;
2086 /* Preload the first input scanline */
2088 uint8_t *src_ptr = (uint8_t*) src_line;
2089 uint32_t count = (width + 15) / 16;
2091 #ifdef USE_GCC_INLINE_ASM
2094 " subs %[count], %[count], #1 \n"
2096 " add %[src], %[src], #64 \n"
2099 // Clobbered input registers marked as input/outputs
2100 : [src] "+r" (src_ptr), [count] "+r" (count)
2101 : // no unclobbered inputs
2113 uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8]; // deliberately not initialised
2116 // left edge, middle block, right edge
2117 for( ; height--; src_line += src_stride, aligned_line += dst_stride) {
2118 // Uncached framebuffer access is really, really slow if we do it piecemeal.
2119 // It should be much faster if we grab it all at once.
2120 // One scanline should easily fit in L1 cache, so this should not waste RAM bandwidth.
2121 neon_quadword_copy(scan_line, aligned_line, copy_count, copy_tail);
2123 // Apply the actual filter
2124 ARGB8_over_565_8_pix_neon(src_line, scan_line + kernel_offset, src_stride * sizeof(*src_line), kernel_count);
2126 // Copy the modified scanline back
2127 neon_quadword_copy(dst_line, scan_line + copy_offset, width >> 3, (width & 7) * 2);
2132 #endif // USE_GCC_INLINE_ASM
2134 static const pixman_fast_path_t arm_neon_fast_path_array[] =
2136 { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, neon_composite_add_8888_8_8, 0 },
2137 { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, neon_composite_add_8000_8000, 0 },
2138 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, neon_composite_over_n_8_0565, 0 },
2139 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, neon_composite_over_n_8_0565, 0 },
2140 { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_src_24_16, 0 },
2141 { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_src_24_16, 0 },
2142 { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_src_24_16, 0 },
2143 { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_src_24_16, 0 },
2144 #ifdef USE_GCC_INLINE_ASM
2145 { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_src_16_16, 0 },
2146 { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_src_16_16, 0 },
2147 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_over_n_0565, 0 },
2148 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_over_n_0565, 0 },
2149 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_over_8888_0565, 0 },
2150 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_over_8888_0565, 0 },
2152 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, neon_composite_over_8888_8888, 0 },
2153 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, neon_composite_over_8888_8888, 0 },
2154 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, neon_composite_over_8888_8888, 0 },
2155 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, neon_composite_over_8888_8888, 0 },
2156 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK },
2157 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK },
2158 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, neon_composite_over_n_8_8888, 0 },
2159 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, neon_composite_over_n_8_8888, 0 },
2160 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, neon_composite_over_n_8_8888, 0 },
2161 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, neon_composite_over_n_8_8888, 0 },
2165 const pixman_fast_path_t *const arm_neon_fast_paths = arm_neon_fast_path_array;
2168 arm_neon_composite (pixman_implementation_t *imp,
2170 pixman_image_t *src,
2171 pixman_image_t *mask,
2172 pixman_image_t *dest,
2182 if (_pixman_run_fast_path (arm_neon_fast_paths, imp,
2183 op, src, mask, dest,
2192 _pixman_implementation_composite (imp->delegate, op,
2200 static pixman_bool_t
2208 int src_x, int src_y,
2209 int dst_x, int dst_y,
2210 int width, int height)
2212 if(!width || !height)
2215 // accelerate only straight copies involving complete bytes
2216 if(src_bpp != dst_bpp || (src_bpp & 7))
2220 uint32_t bytes_per_pixel = src_bpp >> 3;
2221 uint32_t byte_width = width * bytes_per_pixel;
2222 int32_t src_stride_bytes = src_stride * 4; // parameter is in words for some reason
2223 int32_t dst_stride_bytes = dst_stride * 4;
2224 uint8_t *src_bytes = ((uint8_t*) src_bits) + src_y * src_stride_bytes + src_x * bytes_per_pixel;
2225 uint8_t *dst_bytes = ((uint8_t*) dst_bits) + dst_y * dst_stride_bytes + dst_x * bytes_per_pixel;
2226 uint32_t quadword_count = byte_width / 16;
2227 uint32_t offset = byte_width % 16;
2230 neon_quadword_copy(dst_bytes, src_bytes, quadword_count, offset);
2231 src_bytes += src_stride_bytes;
2232 dst_bytes += dst_stride_bytes;
2239 static pixman_bool_t
2240 arm_neon_blt (pixman_implementation_t *imp,
2247 int src_x, int src_y,
2248 int dst_x, int dst_y,
2249 int width, int height)
2251 if (pixman_blt_neon (
2252 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
2253 src_x, src_y, dst_x, dst_y, width, height))
2256 return _pixman_implementation_blt (
2258 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
2259 src_x, src_y, dst_x, dst_y, width, height);
2262 static pixman_bool_t
2263 arm_neon_fill (pixman_implementation_t *imp,
2273 if (pixman_fill_neon (bits, stride, bpp, x, y, width, height, xor))
2276 return _pixman_implementation_fill (
2277 imp->delegate, bits, stride, bpp, x, y, width, height, xor);
2280 pixman_implementation_t *
2281 _pixman_implementation_create_arm_neon (void)
2283 pixman_implementation_t *simd = _pixman_implementation_create_arm_simd();
2284 pixman_implementation_t *imp = _pixman_implementation_create (simd);
2286 imp->composite = arm_neon_composite;
2287 imp->blt = arm_neon_blt;
2288 imp->fill = arm_neon_fill;