/*
 * Copyright © 2007 Luca Barbato
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Luca Barbato not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Luca Barbato makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Luca Barbato (lu_zero@gentoo.org)
 *
 * Based on fbmmx.c by Owen Taylor, Søren Sandmann and Nicholas Miell
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include "pixman-private.h"
#include "pixman-combine32.h"
#include <altivec.h>

/* Build an AltiVec vector literal from a brace-initializer list. */
#define AVV(x...) {x}

static force_inline vector unsigned int
splat_alpha (vector unsigned int pix)
{
    return vec_perm (pix, pix,
                     (vector unsigned char)AVV (
                         0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04,
                         0x08, 0x08, 0x08, 0x08, 0x0C, 0x0C, 0x0C, 0x0C));
}
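
/*
 * splat_alpha relies on pixman's big-endian a8r8g8b8 layout (VMX targets
 * big-endian PowerPC): byte 0 of each 32-bit pixel is alpha, so the
 * permute map copies bytes 0, 4, 8 and 12 across their words.  E.g. a
 * word holding the bytes AA RR GG BB becomes AA AA AA AA.
 */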

static force_inline vector unsigned int
pix_multiply (vector unsigned int p, vector unsigned int a)
{
    vector unsigned short hi, lo, mod;

    /* unpack the high eight bytes to shorts */
    hi = (vector unsigned short)
        vec_mergeh ((vector unsigned char)AVV (0),
                    (vector unsigned char)p);

    mod = (vector unsigned short)
        vec_mergeh ((vector unsigned char)AVV (0),
                    (vector unsigned char)a);

    hi = vec_mladd (hi, mod, (vector unsigned short)
                    AVV (0x0080, 0x0080, 0x0080, 0x0080,
                         0x0080, 0x0080, 0x0080, 0x0080));

    hi = vec_adds (hi, vec_sr (hi, vec_splat_u16 (8)));

    hi = vec_sr (hi, vec_splat_u16 (8));

    /* unpack the low eight bytes to shorts */
    lo = (vector unsigned short)
        vec_mergel ((vector unsigned char)AVV (0),
                    (vector unsigned char)p);

    mod = (vector unsigned short)
        vec_mergel ((vector unsigned char)AVV (0),
                    (vector unsigned char)a);

    lo = vec_mladd (lo, mod, (vector unsigned short)
                    AVV (0x0080, 0x0080, 0x0080, 0x0080,
                         0x0080, 0x0080, 0x0080, 0x0080));

    lo = vec_adds (lo, vec_sr (lo, vec_splat_u16 (8)));

    lo = vec_sr (lo, vec_splat_u16 (8));

    return (vector unsigned int)vec_packsu (hi, lo);
}
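
/*
 * The 0x0080 bias and shift sequence in pix_multiply is the standard
 * exact byte multiply: for x, m in [0, 255] and t = x*m + 0x80,
 *
 *     ((t + (t >> 8)) >> 8) == x*m/255 rounded to nearest.
 *
 * E.g. x = m = 0xFF: t = 0xFE81, t + (t >> 8) = 0xFF7F, >> 8 = 0xFF.
 */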

static force_inline vector unsigned int
pix_add (vector unsigned int a, vector unsigned int b)
{
    /* per-byte saturating add */
    return (vector unsigned int)vec_adds ((vector unsigned char)a,
                                          (vector unsigned char)b);
}

static force_inline vector unsigned int
pix_add_mul (vector unsigned int x,
             vector unsigned int a,
             vector unsigned int y,
             vector unsigned int b)
{
    vector unsigned int t1, t2;

    t1 = pix_multiply (x, a);
    t2 = pix_multiply (y, b);

    return pix_add (t1, t2);
}
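
/*
 * pix_add_mul computes x*a + y*b per channel, saturating; it is the
 * building block of the ATOP, ATOP_REVERSE and XOR combiners below.
 */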

static force_inline vector unsigned int
negate (vector unsigned int src)
{
    /* bitwise complement: 255 - c in every byte */
    return vec_nor (src, src);
}

/* dest*~srca + src */
static force_inline vector unsigned int
over (vector unsigned int src,
      vector unsigned int srca,
      vector unsigned int dest)
{
    vector unsigned char tmp = (vector unsigned char)
        pix_multiply (dest, negate (srca));

    tmp = vec_adds ((vector unsigned char)src, tmp);
    return (vector unsigned int)tmp;
}
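
/*
 * With premultiplied alpha, Porter-Duff OVER is
 *
 *     result = src + dest * (1 - alpha (src))
 *
 * per channel; the saturating vec_adds clamps any overflow to 0xFF.
 */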

/* in == pix_multiply */
#define in_over(src, srca, mask, dest)                                  \
    over (pix_multiply (src, mask),                                     \
          pix_multiply (srca, mask), dest)
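
/*
 * Expanding the macro: in_over (s, sa, m, d) computes (s IN m) OVER d,
 * i.e. s*m + d*(1 - sa*m) per channel: the source is first scaled by
 * the per-channel mask, then composited over the destination.
 */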

#define COMPUTE_SHIFT_MASK(source)                                      \
    source ## _mask = vec_lvsl (0, source);

#define COMPUTE_SHIFT_MASKS(dest, source)                               \
    dest ## _mask = vec_lvsl (0, dest);                                 \
    source ## _mask = vec_lvsl (0, source);                             \
    store_mask = vec_lvsr (0, dest);

#define COMPUTE_SHIFT_MASKC(dest, source, mask)                         \
    mask ## _mask = vec_lvsl (0, mask);                                 \
    dest ## _mask = vec_lvsl (0, dest);                                 \
    source ## _mask = vec_lvsl (0, source);                             \
    store_mask = vec_lvsr (0, dest);
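
/*
 * Classic AltiVec unaligned access: vec_ld ignores the low four address
 * bits, so vec_lvsl/vec_lvsr yield the permute vectors that shift two
 * adjacent aligned loads into place (and shift data back for stores).
 * A sketch of one unaligned 16-byte load from p:
 *
 *     v = vec_perm (vec_ld (0, p), vec_ld (15, p), vec_lvsl (0, p));
 */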

/* The caller must declare the temporaries tmp1..tmp4, edges, the
 * per-pointer *_mask vectors and store_mask.
 * Note: tmp3 and tmp4 must remain untouched between LOAD_VECTORS* and
 * STORE_VECTOR, which reuses them to rebuild the edge bytes.
 */

#define LOAD_VECTORS(dest, source)                                      \
    tmp1 = (typeof(tmp1))vec_ld (0, source);                            \
    tmp2 = (typeof(tmp2))vec_ld (15, source);                           \
    tmp3 = (typeof(tmp3))vec_ld (0, dest);                              \
    v ## source = (typeof(v ## source))                                 \
        vec_perm (tmp1, tmp2, source ## _mask);                         \
    tmp4 = (typeof(tmp4))vec_ld (15, dest);                             \
    v ## dest = (typeof(v ## dest))                                     \
        vec_perm (tmp3, tmp4, dest ## _mask);

#define LOAD_VECTORSC(dest, source, mask)                               \
    tmp1 = (typeof(tmp1))vec_ld (0, source);                            \
    tmp2 = (typeof(tmp2))vec_ld (15, source);                           \
    tmp3 = (typeof(tmp3))vec_ld (0, dest);                              \
    v ## source = (typeof(v ## source))                                 \
        vec_perm (tmp1, tmp2, source ## _mask);                         \
    tmp4 = (typeof(tmp4))vec_ld (15, dest);                             \
    tmp1 = (typeof(tmp1))vec_ld (0, mask);                              \
    v ## dest = (typeof(v ## dest))                                     \
        vec_perm (tmp3, tmp4, dest ## _mask);                           \
    tmp2 = (typeof(tmp2))vec_ld (15, mask);                             \
    v ## mask = (typeof(v ## mask))                                     \
        vec_perm (tmp1, tmp2, mask ## _mask);

#define LOAD_VECTORSM(dest, source, mask)                               \
    LOAD_VECTORSC (dest, source, mask)                                  \
    v ## source = pix_multiply (v ## source,                            \
                                splat_alpha (v ## mask));
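
/*
 * Three load flavors: LOAD_VECTORS fetches four source and four
 * destination pixels; LOAD_VECTORSC also fetches four mask pixels (for
 * the component-alpha operators); LOAD_VECTORSM folds the mask in right
 * away, scaling the source by the mask's alpha channel.
 */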

#define STORE_VECTOR(dest)                                              \
    edges = vec_perm (tmp4, tmp3, dest ## _mask);                       \
    tmp3 = vec_perm ((vector unsigned char)v ## dest, edges, store_mask); \
    tmp1 = vec_perm (edges, (vector unsigned char)v ## dest, store_mask); \
    vec_st ((vector unsigned int) tmp3, 15, dest);                      \
    vec_st ((vector unsigned int) tmp1, 0, dest);
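
/*
 * STORE_VECTOR is the matching unaligned read-modify-write: `edges'
 * gathers the bytes of the two aligned quadwords that lie outside the
 * 16 destination bytes (taken from the still-live tmp3/tmp4), then
 * vdest is merged with them and both quadwords are written back.
 */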

static void
vmx_combine_over_u_no_mask (uint32_t *      dest,
                            const uint32_t *src,
                            int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORS (dest, src);

        vdest = over (vsrc, splat_alpha (vsrc), vdest);

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t ia = ALPHA_8 (~s);

        UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);

        dest[i] = d;
    }
}
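
/*
 * Every vmx_combine_* worker below shares this shape: a vector loop
 * that handles four pixels per iteration, then a scalar tail for the
 * remaining width % 4 pixels built from the ALPHA_8 and UN8x4_* macros
 * of pixman-combine32.h.
 */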

static void
vmx_combine_over_u_mask (uint32_t *      dest,
                         const uint32_t *src,
                         const uint32_t *mask,
                         int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSM (dest, src, mask);

        vdest = over (vsrc, splat_alpha (vsrc), vdest);

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t ia;

        UN8x4_MUL_UN8 (s, m);

        ia = ALPHA_8 (~s);

        UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);

        dest[i] = d;
    }
}

static void
vmx_combine_over_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    if (mask)
        vmx_combine_over_u_mask (dest, src, mask, width);
    else
        vmx_combine_over_u_no_mask (dest, src, width);
}
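
/*
 * The combine_32[] entry points receive mask == NULL when no mask is
 * present; each simply dispatches to the masked or unmasked worker.
 */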

/* OVER_REVERSE: dest + src*(1 - dest.alpha) */
static void
vmx_combine_over_reverse_u_no_mask (uint32_t *      dest,
                                    const uint32_t *src,
                                    int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORS (dest, src);

        vdest = over (vdest, splat_alpha (vdest), vsrc);

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t ia = ALPHA_8 (~dest[i]);

        UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);

        dest[i] = s;
    }
}

static void
vmx_combine_over_reverse_u_mask (uint32_t *      dest,
                                 const uint32_t *src,
                                 const uint32_t *mask,
                                 int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSM (dest, src, mask);

        vdest = over (vdest, splat_alpha (vdest), vsrc);

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t ia = ALPHA_8 (~dest[i]);

        UN8x4_MUL_UN8 (s, m);

        UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);

        dest[i] = s;
    }
}

static void
vmx_combine_over_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    if (mask)
        vmx_combine_over_reverse_u_mask (dest, src, mask, width);
    else
        vmx_combine_over_reverse_u_no_mask (dest, src, width);
}

/* IN: src * dest.alpha */
static void
vmx_combine_in_u_no_mask (uint32_t *      dest,
                          const uint32_t *src,
                          int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORS (dest, src);

        vdest = pix_multiply (vsrc, splat_alpha (vdest));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t s = src[i];
        uint32_t a = ALPHA_8 (dest[i]);

        UN8x4_MUL_UN8 (s, a);

        dest[i] = s;
    }
}

static void
vmx_combine_in_u_mask (uint32_t *      dest,
                       const uint32_t *src,
                       const uint32_t *mask,
                       int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSM (dest, src, mask);

        vdest = pix_multiply (vsrc, splat_alpha (vdest));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t s = src[i];
        uint32_t a = ALPHA_8 (dest[i]);

        UN8x4_MUL_UN8 (s, m);
        UN8x4_MUL_UN8 (s, a);

        dest[i] = s;
    }
}

static void
vmx_combine_in_u (pixman_implementation_t *imp,
                  pixman_op_t              op,
                  uint32_t *               dest,
                  const uint32_t *         src,
                  const uint32_t *         mask,
                  int                      width)
{
    if (mask)
        vmx_combine_in_u_mask (dest, src, mask, width);
    else
        vmx_combine_in_u_no_mask (dest, src, width);
}

/* IN_REVERSE: dest * src.alpha */
static void
vmx_combine_in_reverse_u_no_mask (uint32_t *      dest,
                                  const uint32_t *src,
                                  int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORS (dest, src);

        vdest = pix_multiply (vdest, splat_alpha (vsrc));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t d = dest[i];
        uint32_t a = ALPHA_8 (src[i]);

        UN8x4_MUL_UN8 (d, a);

        dest[i] = d;
    }
}

static void
vmx_combine_in_reverse_u_mask (uint32_t *      dest,
                               const uint32_t *src,
                               const uint32_t *mask,
                               int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSM (dest, src, mask);

        vdest = pix_multiply (vdest, splat_alpha (vsrc));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t d = dest[i];
        uint32_t a = src[i];

        UN8x4_MUL_UN8 (a, m);

        a = ALPHA_8 (a);
        UN8x4_MUL_UN8 (d, a);

        dest[i] = d;
    }
}

static void
vmx_combine_in_reverse_u (pixman_implementation_t *imp,
                          pixman_op_t              op,
                          uint32_t *               dest,
                          const uint32_t *         src,
                          const uint32_t *         mask,
                          int                      width)
{
    if (mask)
        vmx_combine_in_reverse_u_mask (dest, src, mask, width);
    else
        vmx_combine_in_reverse_u_no_mask (dest, src, width);
}

/* OUT: src * (1 - dest.alpha) */
static void
vmx_combine_out_u_no_mask (uint32_t *      dest,
                           const uint32_t *src,
                           int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORS (dest, src);

        vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t s = src[i];
        uint32_t a = ALPHA_8 (~dest[i]);

        UN8x4_MUL_UN8 (s, a);

        dest[i] = s;
    }
}

static void
vmx_combine_out_u_mask (uint32_t *      dest,
                        const uint32_t *src,
                        const uint32_t *mask,
                        int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSM (dest, src, mask);

        vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t s = src[i];
        uint32_t a = ALPHA_8 (~dest[i]);

        UN8x4_MUL_UN8 (s, m);
        UN8x4_MUL_UN8 (s, a);

        dest[i] = s;
    }
}

static void
vmx_combine_out_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    if (mask)
        vmx_combine_out_u_mask (dest, src, mask, width);
    else
        vmx_combine_out_u_no_mask (dest, src, width);
}

/* OUT_REVERSE: dest * (1 - src.alpha) */
static void
vmx_combine_out_reverse_u_no_mask (uint32_t *      dest,
                                   const uint32_t *src,
                                   int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORS (dest, src);

        vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t d = dest[i];
        uint32_t a = ALPHA_8 (~src[i]);

        UN8x4_MUL_UN8 (d, a);

        dest[i] = d;
    }
}

static void
vmx_combine_out_reverse_u_mask (uint32_t *      dest,
                                const uint32_t *src,
                                const uint32_t *mask,
                                int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSM (dest, src, mask);

        vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t d = dest[i];
        uint32_t a = src[i];

        UN8x4_MUL_UN8 (a, m);

        a = ALPHA_8 (~a);
        UN8x4_MUL_UN8 (d, a);

        dest[i] = d;
    }
}

static void
vmx_combine_out_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               dest,
                           const uint32_t *         src,
                           const uint32_t *         mask,
                           int                      width)
{
    if (mask)
        vmx_combine_out_reverse_u_mask (dest, src, mask, width);
    else
        vmx_combine_out_reverse_u_no_mask (dest, src, width);
}

/* ATOP: src*dest.alpha + dest*(1 - src.alpha) */
static void
vmx_combine_atop_u_no_mask (uint32_t *      dest,
                            const uint32_t *src,
                            int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORS (dest, src);

        vdest = pix_add_mul (vsrc, splat_alpha (vdest),
                             vdest, splat_alpha (negate (vsrc)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t dest_a = ALPHA_8 (d);
        uint32_t src_ia = ALPHA_8 (~s);

        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);

        dest[i] = s;
    }
}

static void
vmx_combine_atop_u_mask (uint32_t *      dest,
                         const uint32_t *src,
                         const uint32_t *mask,
                         int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSM (dest, src, mask);

        vdest = pix_add_mul (vsrc, splat_alpha (vdest),
                             vdest, splat_alpha (negate (vsrc)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t dest_a = ALPHA_8 (d);
        uint32_t src_ia;

        UN8x4_MUL_UN8 (s, m);

        src_ia = ALPHA_8 (~s);

        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);

        dest[i] = s;
    }
}

static void
vmx_combine_atop_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    if (mask)
        vmx_combine_atop_u_mask (dest, src, mask, width);
    else
        vmx_combine_atop_u_no_mask (dest, src, width);
}

/* ATOP_REVERSE: dest*src.alpha + src*(1 - dest.alpha) */
static void
vmx_combine_atop_reverse_u_no_mask (uint32_t *      dest,
                                    const uint32_t *src,
                                    int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORS (dest, src);

        vdest = pix_add_mul (vdest, splat_alpha (vsrc),
                             vsrc, splat_alpha (negate (vdest)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t src_a = ALPHA_8 (s);
        uint32_t dest_ia = ALPHA_8 (~d);

        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);

        dest[i] = s;
    }
}

static void
vmx_combine_atop_reverse_u_mask (uint32_t *      dest,
                                 const uint32_t *src,
                                 const uint32_t *mask,
                                 int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSM (dest, src, mask);

        vdest = pix_add_mul (vdest, splat_alpha (vsrc),
                             vsrc, splat_alpha (negate (vdest)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t src_a;
        uint32_t dest_ia = ALPHA_8 (~d);

        UN8x4_MUL_UN8 (s, m);

        src_a = ALPHA_8 (s);

        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);

        dest[i] = s;
    }
}

static void
vmx_combine_atop_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    if (mask)
        vmx_combine_atop_reverse_u_mask (dest, src, mask, width);
    else
        vmx_combine_atop_reverse_u_no_mask (dest, src, width);
}

/* XOR: src*(1 - dest.alpha) + dest*(1 - src.alpha) */
static void
vmx_combine_xor_u_no_mask (uint32_t *      dest,
                           const uint32_t *src,
                           int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORS (dest, src);

        vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
                             vdest, splat_alpha (negate (vsrc)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t src_ia = ALPHA_8 (~s);
        uint32_t dest_ia = ALPHA_8 (~d);

        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);

        dest[i] = s;
    }
}

static void
vmx_combine_xor_u_mask (uint32_t *      dest,
                        const uint32_t *src,
                        const uint32_t *mask,
                        int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSM (dest, src, mask);

        vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
                             vdest, splat_alpha (negate (vsrc)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t src_ia;
        uint32_t dest_ia = ALPHA_8 (~d);

        UN8x4_MUL_UN8 (s, m);

        src_ia = ALPHA_8 (~s);

        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);

        dest[i] = s;
    }
}

static void
vmx_combine_xor_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    if (mask)
        vmx_combine_xor_u_mask (dest, src, mask, width);
    else
        vmx_combine_xor_u_no_mask (dest, src, width);
}

/* ADD: saturating src + dest */
static void
vmx_combine_add_u_no_mask (uint32_t *      dest,
                           const uint32_t *src,
                           int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORS (dest, src);

        vdest = pix_add (vsrc, vdest);

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t s = src[i];
        uint32_t d = dest[i];

        UN8x4_ADD_UN8x4 (d, s);

        dest[i] = d;
    }
}

static void
vmx_combine_add_u_mask (uint32_t *      dest,
                        const uint32_t *src,
                        const uint32_t *mask,
                        int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSM (dest, src, mask);

        vdest = pix_add (vsrc, vdest);

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t s = src[i];
        uint32_t d = dest[i];

        UN8x4_MUL_UN8 (s, m);
        UN8x4_ADD_UN8x4 (d, s);

        dest[i] = d;
    }
}

static void
vmx_combine_add_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    if (mask)
        vmx_combine_add_u_mask (dest, src, mask, width);
    else
        vmx_combine_add_u_no_mask (dest, src, width);
}

/* SRC (component alpha): dest = src * mask */
static void
vmx_combine_src_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSC (dest, src, mask);

        vdest = pix_multiply (vsrc, vmask);

        STORE_VECTOR (dest);

        mask += 4;
        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t a = mask[i];
        uint32_t s = src[i];

        UN8x4_MUL_UN8x4 (s, a);

        dest[i] = s;
    }
}

/* OVER (component alpha): (src IN mask) OVER dest */
static void
vmx_combine_over_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dest,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSC (dest, src, mask);

        vdest = in_over (vsrc, splat_alpha (vsrc), vmask, vdest);

        STORE_VECTOR (dest);

        mask += 4;
        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t a = mask[i];
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t sa = ALPHA_8 (s);

        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_MUL_UN8 (a, sa);
        UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ~a, s);

        dest[i] = d;
    }
}

/* OVER_REVERSE (component alpha): dest OVER (src IN mask) */
static void
vmx_combine_over_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dest,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSC (dest, src, mask);

        vdest = over (vdest, splat_alpha (vdest), pix_multiply (vsrc, vmask));

        STORE_VECTOR (dest);

        mask += 4;
        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t a = mask[i];
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t ida = ALPHA_8 (~d);

        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_MUL_UN8_ADD_UN8x4 (s, ida, d);

        dest[i] = s;
    }
}

/* IN (component alpha): src * mask * dest.alpha */
static void
vmx_combine_in_ca (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSC (dest, src, mask);

        vdest = pix_multiply (pix_multiply (vsrc, vmask), splat_alpha (vdest));

        STORE_VECTOR (dest);

        mask += 4;
        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t a = mask[i];
        uint32_t s = src[i];
        uint32_t da = ALPHA_8 (dest[i]);

        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_MUL_UN8 (s, da);

        dest[i] = s;
    }
}

/* IN_REVERSE (component alpha): dest * (mask * src.alpha) */
static void
vmx_combine_in_reverse_ca (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               dest,
                           const uint32_t *         src,
                           const uint32_t *         mask,
                           int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSC (dest, src, mask);

        vdest = pix_multiply (vdest, pix_multiply (vmask, splat_alpha (vsrc)));

        STORE_VECTOR (dest);

        mask += 4;
        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t a = mask[i];
        uint32_t d = dest[i];
        uint32_t sa = ALPHA_8 (src[i]);

        UN8x4_MUL_UN8 (a, sa);
        UN8x4_MUL_UN8x4 (d, a);

        dest[i] = d;
    }
}

/* OUT (component alpha): src * mask * (1 - dest.alpha) */
static void
vmx_combine_out_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSC (dest, src, mask);

        vdest = pix_multiply (
            pix_multiply (vsrc, vmask), splat_alpha (negate (vdest)));

        STORE_VECTOR (dest);

        mask += 4;
        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t a = mask[i];
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t da = ALPHA_8 (~d);

        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_MUL_UN8 (s, da);

        dest[i] = s;
    }
}

/* OUT_REVERSE (component alpha): dest * (1 - mask * src.alpha) */
static void
vmx_combine_out_reverse_ca (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSC (dest, src, mask);

        vdest = pix_multiply (
            vdest, negate (pix_multiply (vmask, splat_alpha (vsrc))));

        STORE_VECTOR (dest);

        mask += 4;
        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t a = mask[i];
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t sa = ALPHA_8 (s);

        UN8x4_MUL_UN8 (a, sa);
        UN8x4_MUL_UN8x4 (d, ~a);

        dest[i] = d;
    }
}

/* ATOP (component alpha) */
static void
vmx_combine_atop_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dest,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask, vsrca;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSC (dest, src, mask);

        vsrca = splat_alpha (vsrc);

        vsrc = pix_multiply (vsrc, vmask);
        vmask = pix_multiply (vmask, vsrca);

        vdest = pix_add_mul (vsrc, splat_alpha (vdest),
                             negate (vmask), vdest);

        STORE_VECTOR (dest);

        mask += 4;
        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t a = mask[i];
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t sa = ALPHA_8 (s);
        uint32_t da = ALPHA_8 (d);

        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_MUL_UN8 (a, sa);
        UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);

        dest[i] = d;
    }
}

/* ATOP_REVERSE (component alpha) */
static void
vmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dest,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSC (dest, src, mask);

        vdest = pix_add_mul (vdest,
                             pix_multiply (vmask, splat_alpha (vsrc)),
                             pix_multiply (vsrc, vmask),
                             negate (splat_alpha (vdest)));

        STORE_VECTOR (dest);

        mask += 4;
        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t a = mask[i];
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t sa = ALPHA_8 (s);
        uint32_t da = ALPHA_8 (~d);

        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_MUL_UN8 (a, sa);
        UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da);

        dest[i] = d;
    }
}

/* XOR (component alpha) */
static void
vmx_combine_xor_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSC (dest, src, mask);

        vdest = pix_add_mul (vdest,
                             negate (pix_multiply (vmask, splat_alpha (vsrc))),
                             pix_multiply (vsrc, vmask),
                             negate (splat_alpha (vdest)));

        STORE_VECTOR (dest);

        mask += 4;
        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t a = mask[i];
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t sa = ALPHA_8 (s);
        uint32_t da = ALPHA_8 (~d);

        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_MUL_UN8 (a, sa);
        UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);

        dest[i] = d;
    }
}

/* ADD (component alpha): saturating src*mask + dest */
static void
vmx_combine_add_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
        dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    for (i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSC (dest, src, mask);

        vdest = pix_add (pix_multiply (vsrc, vmask), vdest);

        STORE_VECTOR (dest);

        mask += 4;
        src += 4;
        dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
        uint32_t a = mask[i];
        uint32_t s = src[i];
        uint32_t d = dest[i];

        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_ADD_UN8x4 (s, d);

        dest[i] = s;
    }
}

static const pixman_fast_path_t vmx_fast_paths[] =
{
    { PIXMAN_OP_NONE },
};
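
/*
 * The table above holds only the PIXMAN_OP_NONE sentinel: this
 * implementation accelerates the combiners alone and delegates whole
 * operations to the fallback implementation.
 */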

pixman_implementation_t *
_pixman_implementation_create_vmx (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp =
        _pixman_implementation_create (fallback, vmx_fast_paths);

    /* Set up function pointers */
    imp->combine_32[PIXMAN_OP_OVER] = vmx_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = vmx_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = vmx_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = vmx_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = vmx_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = vmx_combine_add_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = vmx_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = vmx_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = vmx_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = vmx_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = vmx_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = vmx_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = vmx_combine_add_ca;

    return imp;
}