92f030871a7ed1e1ba5285796fad0eb4cb8770d5
[profile/ivi/pixman.git] / pixman / pixman-fast-path.c
1 /* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
2 /*
3  * Copyright © 2000 SuSE, Inc.
4  * Copyright © 2007 Red Hat, Inc.
5  *
6  * Permission to use, copy, modify, distribute, and sell this software and its
7  * documentation for any purpose is hereby granted without fee, provided that
8  * the above copyright notice appear in all copies and that both that
9  * copyright notice and this permission notice appear in supporting
10  * documentation, and that the name of SuSE not be used in advertising or
11  * publicity pertaining to distribution of the software without specific,
12  * written prior permission.  SuSE makes no representations about the
13  * suitability of this software for any purpose.  It is provided "as is"
14  * without express or implied warranty.
15  *
16  * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
18  * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
20  * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
21  * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
22  *
23  * Author:  Keith Packard, SuSE, Inc.
24  */
25
26 #ifdef HAVE_CONFIG_H
27 #include <config.h>
28 #endif
29 #include <string.h>
30 #include <stdlib.h>
31 #include "pixman-private.h"
32 #include "pixman-combine32.h"
33 #include "pixman-fast-path.h"
34
/*
 * Read the packed 24-bit pixel stored in the three bytes starting at @a
 * and return it in the low 24 bits of the result.
 *
 * The pixel is fetched as one byte plus one 16-bit load.  Which of the
 * two comes first is chosen from the low bit of the address, so that the
 * 16-bit load is always issued at an even address.  WORDS_BIGENDIAN
 * selects how the two pieces are combined.
 */
static force_inline uint32_t
fetch_24 (uint8_t *a)
{
    /* uintptr_t is the integer type intended to hold a pointer value;
     * unsigned long is narrower than a pointer on LLP64 targets.
     */
    if (((uintptr_t)a) & 1)
    {
        /* Odd address: single byte first, then the (even) 16-bit half. */
#ifdef WORDS_BIGENDIAN
        return (*a << 16) | (*(uint16_t *)(a + 1));
#else
        return *a | (*(uint16_t *)(a + 1) << 8);
#endif
    }
    else
    {
        /* Even address: 16-bit half first, then the trailing byte. */
#ifdef WORDS_BIGENDIAN
        return (*(uint16_t *)a << 8) | *(a + 2);
#else
        return *(uint16_t *)a | (*(a + 2) << 16);
#endif
    }
}
55
/*
 * Write the low 24 bits of @v as a packed 24-bit pixel into the three
 * bytes starting at @a.  Bits 24-31 of @v are discarded.
 *
 * The pixel is written as one byte plus one 16-bit store.  Which of the
 * two comes first is chosen from the low bit of the address, so that the
 * 16-bit store is always issued at an even address.  WORDS_BIGENDIAN
 * selects how @v is split between the two pieces.
 */
static force_inline void
store_24 (uint8_t *a,
          uint32_t v)
{
    /* uintptr_t is the integer type intended to hold a pointer value;
     * unsigned long is narrower than a pointer on LLP64 targets.
     */
    if (((uintptr_t)a) & 1)
    {
        /* Odd address: single byte first, then the (even) 16-bit half. */
#ifdef WORDS_BIGENDIAN
        *a = (uint8_t) (v >> 16);
        *(uint16_t *)(a + 1) = (uint16_t) (v);
#else
        *a = (uint8_t) (v);
        *(uint16_t *)(a + 1) = (uint16_t) (v >> 8);
#endif
    }
    else
    {
        /* Even address: 16-bit half first, then the trailing byte. */
#ifdef WORDS_BIGENDIAN
        *(uint16_t *)a = (uint16_t)(v >> 8);
        *(a + 2) = (uint8_t)v;
#else
        *(uint16_t *)a = (uint16_t)v;
        *(a + 2) = (uint8_t)(v >> 16);
#endif
    }
}
81
82 static force_inline uint32_t
83 over (uint32_t src,
84       uint32_t dest)
85 {
86     uint32_t a = ~src >> 24;
87
88     UN8x4_MUL_UN8_ADD_UN8x4 (dest, a, src);
89
90     return dest;
91 }
92
93 static uint32_t
94 in (uint32_t x,
95     uint8_t  y)
96 {
97     uint16_t a = y;
98
99     UN8x4_MUL_UN8 (x, a);
100
101     return x;
102 }
103
104 /*
105  * Naming convention:
106  *
107  *  op_src_mask_dest
108  */
109 static void
110 fast_composite_over_x888_8_8888 (pixman_implementation_t *imp,
111                                  pixman_op_t              op,
112                                  pixman_image_t *         src_image,
113                                  pixman_image_t *         mask_image,
114                                  pixman_image_t *         dst_image,
115                                  int32_t                  src_x,
116                                  int32_t                  src_y,
117                                  int32_t                  mask_x,
118                                  int32_t                  mask_y,
119                                  int32_t                  dest_x,
120                                  int32_t                  dest_y,
121                                  int32_t                  width,
122                                  int32_t                  height)
123 {
124     uint32_t    *src, *src_line;
125     uint32_t    *dst, *dst_line;
126     uint8_t     *mask, *mask_line;
127     int src_stride, mask_stride, dst_stride;
128     uint8_t m;
129     uint32_t s, d;
130     int32_t w;
131
132     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
133     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
134     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
135
136     while (height--)
137     {
138         src = src_line;
139         src_line += src_stride;
140         dst = dst_line;
141         dst_line += dst_stride;
142         mask = mask_line;
143         mask_line += mask_stride;
144
145         w = width;
146         while (w--)
147         {
148             m = *mask++;
149             if (m)
150             {
151                 s = *src | 0xff000000;
152
153                 if (m == 0xff)
154                 {
155                     *dst = s;
156                 }
157                 else
158                 {
159                     d = in (s, m);
160                     *dst = over (d, *dst);
161                 }
162             }
163             src++;
164             dst++;
165         }
166     }
167 }
168
169 static void
170 fast_composite_in_n_8_8 (pixman_implementation_t *imp,
171                          pixman_op_t              op,
172                          pixman_image_t *         src_image,
173                          pixman_image_t *         mask_image,
174                          pixman_image_t *         dest_image,
175                          int32_t                  src_x,
176                          int32_t                  src_y,
177                          int32_t                  mask_x,
178                          int32_t                  mask_y,
179                          int32_t                  dest_x,
180                          int32_t                  dest_y,
181                          int32_t                  width,
182                          int32_t                  height)
183 {
184     uint32_t src, srca;
185     uint8_t     *dst_line, *dst;
186     uint8_t     *mask_line, *mask, m;
187     int dst_stride, mask_stride;
188     int32_t w;
189     uint16_t t;
190
191     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
192
193     srca = src >> 24;
194
195     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
196     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
197
198     if (srca == 0xff)
199     {
200         while (height--)
201         {
202             dst = dst_line;
203             dst_line += dst_stride;
204             mask = mask_line;
205             mask_line += mask_stride;
206             w = width;
207
208             while (w--)
209             {
210                 m = *mask++;
211
212                 if (m == 0)
213                     *dst = 0;
214                 else if (m != 0xff)
215                     *dst = MUL_UN8 (m, *dst, t);
216
217                 dst++;
218             }
219         }
220     }
221     else
222     {
223         while (height--)
224         {
225             dst = dst_line;
226             dst_line += dst_stride;
227             mask = mask_line;
228             mask_line += mask_stride;
229             w = width;
230
231             while (w--)
232             {
233                 m = *mask++;
234                 m = MUL_UN8 (m, srca, t);
235
236                 if (m == 0)
237                     *dst = 0;
238                 else if (m != 0xff)
239                     *dst = MUL_UN8 (m, *dst, t);
240
241                 dst++;
242             }
243         }
244     }
245 }
246
247 static void
248 fast_composite_in_8_8 (pixman_implementation_t *imp,
249                        pixman_op_t              op,
250                        pixman_image_t *         src_image,
251                        pixman_image_t *         mask_image,
252                        pixman_image_t *         dest_image,
253                        int32_t                  src_x,
254                        int32_t                  src_y,
255                        int32_t                  mask_x,
256                        int32_t                  mask_y,
257                        int32_t                  dest_x,
258                        int32_t                  dest_y,
259                        int32_t                  width,
260                        int32_t                  height)
261 {
262     uint8_t     *dst_line, *dst;
263     uint8_t     *src_line, *src;
264     int dst_stride, src_stride;
265     int32_t w;
266     uint8_t s;
267     uint16_t t;
268
269     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
270     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
271
272     while (height--)
273     {
274         dst = dst_line;
275         dst_line += dst_stride;
276         src = src_line;
277         src_line += src_stride;
278         w = width;
279
280         while (w--)
281         {
282             s = *src++;
283
284             if (s == 0)
285                 *dst = 0;
286             else if (s != 0xff)
287                 *dst = MUL_UN8 (s, *dst, t);
288
289             dst++;
290         }
291     }
292 }
293
294 static void
295 fast_composite_over_n_8_8888 (pixman_implementation_t *imp,
296                               pixman_op_t              op,
297                               pixman_image_t *         src_image,
298                               pixman_image_t *         mask_image,
299                               pixman_image_t *         dst_image,
300                               int32_t                  src_x,
301                               int32_t                  src_y,
302                               int32_t                  mask_x,
303                               int32_t                  mask_y,
304                               int32_t                  dest_x,
305                               int32_t                  dest_y,
306                               int32_t                  width,
307                               int32_t                  height)
308 {
309     uint32_t src, srca;
310     uint32_t    *dst_line, *dst, d;
311     uint8_t     *mask_line, *mask, m;
312     int dst_stride, mask_stride;
313     int32_t w;
314
315     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
316
317     srca = src >> 24;
318     if (src == 0)
319         return;
320
321     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
322     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
323
324     while (height--)
325     {
326         dst = dst_line;
327         dst_line += dst_stride;
328         mask = mask_line;
329         mask_line += mask_stride;
330         w = width;
331
332         while (w--)
333         {
334             m = *mask++;
335             if (m == 0xff)
336             {
337                 if (srca == 0xff)
338                     *dst = src;
339                 else
340                     *dst = over (src, *dst);
341             }
342             else if (m)
343             {
344                 d = in (src, m);
345                 *dst = over (d, *dst);
346             }
347             dst++;
348         }
349     }
350 }
351
352 static void
353 fast_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
354                                    pixman_op_t              op,
355                                    pixman_image_t *         src_image,
356                                    pixman_image_t *         mask_image,
357                                    pixman_image_t *         dst_image,
358                                    int32_t                  src_x,
359                                    int32_t                  src_y,
360                                    int32_t                  mask_x,
361                                    int32_t                  mask_y,
362                                    int32_t                  dest_x,
363                                    int32_t                  dest_y,
364                                    int32_t                  width,
365                                    int32_t                  height)
366 {
367     uint32_t src, s;
368     uint32_t    *dst_line, *dst, d;
369     uint32_t    *mask_line, *mask, ma;
370     int dst_stride, mask_stride;
371     int32_t w;
372
373     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
374
375     if (src == 0)
376         return;
377
378     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
379     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
380
381     while (height--)
382     {
383         dst = dst_line;
384         dst_line += dst_stride;
385         mask = mask_line;
386         mask_line += mask_stride;
387         w = width;
388
389         while (w--)
390         {
391             ma = *mask++;
392
393             if (ma)
394             {
395                 d = *dst;
396                 s = src;
397
398                 UN8x4_MUL_UN8x4_ADD_UN8x4 (s, ma, d);
399
400                 *dst = s;
401             }
402
403             dst++;
404         }
405     }
406 }
407
408 static void
409 fast_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
410                                     pixman_op_t              op,
411                                     pixman_image_t *         src_image,
412                                     pixman_image_t *         mask_image,
413                                     pixman_image_t *         dst_image,
414                                     int32_t                  src_x,
415                                     int32_t                  src_y,
416                                     int32_t                  mask_x,
417                                     int32_t                  mask_y,
418                                     int32_t                  dest_x,
419                                     int32_t                  dest_y,
420                                     int32_t                  width,
421                                     int32_t                  height)
422 {
423     uint32_t src, srca, s;
424     uint32_t    *dst_line, *dst, d;
425     uint32_t    *mask_line, *mask, ma;
426     int dst_stride, mask_stride;
427     int32_t w;
428
429     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
430
431     srca = src >> 24;
432     if (src == 0)
433         return;
434
435     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
436     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
437
438     while (height--)
439     {
440         dst = dst_line;
441         dst_line += dst_stride;
442         mask = mask_line;
443         mask_line += mask_stride;
444         w = width;
445
446         while (w--)
447         {
448             ma = *mask++;
449             if (ma == 0xffffffff)
450             {
451                 if (srca == 0xff)
452                     *dst = src;
453                 else
454                     *dst = over (src, *dst);
455             }
456             else if (ma)
457             {
458                 d = *dst;
459                 s = src;
460
461                 UN8x4_MUL_UN8x4 (s, ma);
462                 UN8x4_MUL_UN8 (ma, srca);
463                 ma = ~ma;
464                 UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
465
466                 *dst = d;
467             }
468
469             dst++;
470         }
471     }
472 }
473
474 static void
475 fast_composite_over_n_8_0888 (pixman_implementation_t *imp,
476                               pixman_op_t              op,
477                               pixman_image_t *         src_image,
478                               pixman_image_t *         mask_image,
479                               pixman_image_t *         dst_image,
480                               int32_t                  src_x,
481                               int32_t                  src_y,
482                               int32_t                  mask_x,
483                               int32_t                  mask_y,
484                               int32_t                  dest_x,
485                               int32_t                  dest_y,
486                               int32_t                  width,
487                               int32_t                  height)
488 {
489     uint32_t src, srca;
490     uint8_t     *dst_line, *dst;
491     uint32_t d;
492     uint8_t     *mask_line, *mask, m;
493     int dst_stride, mask_stride;
494     int32_t w;
495
496     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
497
498     srca = src >> 24;
499     if (src == 0)
500         return;
501
502     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
503     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
504
505     while (height--)
506     {
507         dst = dst_line;
508         dst_line += dst_stride;
509         mask = mask_line;
510         mask_line += mask_stride;
511         w = width;
512
513         while (w--)
514         {
515             m = *mask++;
516             if (m == 0xff)
517             {
518                 if (srca == 0xff)
519                 {
520                     d = src;
521                 }
522                 else
523                 {
524                     d = fetch_24 (dst);
525                     d = over (src, d);
526                 }
527                 store_24 (dst, d);
528             }
529             else if (m)
530             {
531                 d = over (in (src, m), fetch_24 (dst));
532                 store_24 (dst, d);
533             }
534             dst += 3;
535         }
536     }
537 }
538
539 static void
540 fast_composite_over_n_8_0565 (pixman_implementation_t *imp,
541                               pixman_op_t              op,
542                               pixman_image_t *         src_image,
543                               pixman_image_t *         mask_image,
544                               pixman_image_t *         dst_image,
545                               int32_t                  src_x,
546                               int32_t                  src_y,
547                               int32_t                  mask_x,
548                               int32_t                  mask_y,
549                               int32_t                  dest_x,
550                               int32_t                  dest_y,
551                               int32_t                  width,
552                               int32_t                  height)
553 {
554     uint32_t src, srca;
555     uint16_t    *dst_line, *dst;
556     uint32_t d;
557     uint8_t     *mask_line, *mask, m;
558     int dst_stride, mask_stride;
559     int32_t w;
560
561     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
562
563     srca = src >> 24;
564     if (src == 0)
565         return;
566
567     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
568     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
569
570     while (height--)
571     {
572         dst = dst_line;
573         dst_line += dst_stride;
574         mask = mask_line;
575         mask_line += mask_stride;
576         w = width;
577
578         while (w--)
579         {
580             m = *mask++;
581             if (m == 0xff)
582             {
583                 if (srca == 0xff)
584                 {
585                     d = src;
586                 }
587                 else
588                 {
589                     d = *dst;
590                     d = over (src, CONVERT_0565_TO_0888 (d));
591                 }
592                 *dst = CONVERT_8888_TO_0565 (d);
593             }
594             else if (m)
595             {
596                 d = *dst;
597                 d = over (in (src, m), CONVERT_0565_TO_0888 (d));
598                 *dst = CONVERT_8888_TO_0565 (d);
599             }
600             dst++;
601         }
602     }
603 }
604
605 static void
606 fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
607                                     pixman_op_t              op,
608                                     pixman_image_t *         src_image,
609                                     pixman_image_t *         mask_image,
610                                     pixman_image_t *         dst_image,
611                                     int32_t                  src_x,
612                                     int32_t                  src_y,
613                                     int32_t                  mask_x,
614                                     int32_t                  mask_y,
615                                     int32_t                  dest_x,
616                                     int32_t                  dest_y,
617                                     int32_t                  width,
618                                     int32_t                  height)
619 {
620     uint32_t  src, srca, s;
621     uint16_t  src16;
622     uint16_t *dst_line, *dst;
623     uint32_t  d;
624     uint32_t *mask_line, *mask, ma;
625     int dst_stride, mask_stride;
626     int32_t w;
627
628     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
629
630     srca = src >> 24;
631     if (src == 0)
632         return;
633
634     src16 = CONVERT_8888_TO_0565 (src);
635
636     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
637     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
638
639     while (height--)
640     {
641         dst = dst_line;
642         dst_line += dst_stride;
643         mask = mask_line;
644         mask_line += mask_stride;
645         w = width;
646
647         while (w--)
648         {
649             ma = *mask++;
650             if (ma == 0xffffffff)
651             {
652                 if (srca == 0xff)
653                 {
654                     *dst = src16;
655                 }
656                 else
657                 {
658                     d = *dst;
659                     d = over (src, CONVERT_0565_TO_0888 (d));
660                     *dst = CONVERT_8888_TO_0565 (d);
661                 }
662             }
663             else if (ma)
664             {
665                 d = *dst;
666                 d = CONVERT_0565_TO_0888 (d);
667
668                 s = src;
669
670                 UN8x4_MUL_UN8x4 (s, ma);
671                 UN8x4_MUL_UN8 (ma, srca);
672                 ma = ~ma;
673                 UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
674
675                 *dst = CONVERT_8888_TO_0565 (d);
676             }
677             dst++;
678         }
679     }
680 }
681
682 static void
683 fast_composite_over_8888_8888 (pixman_implementation_t *imp,
684                                pixman_op_t              op,
685                                pixman_image_t *         src_image,
686                                pixman_image_t *         mask_image,
687                                pixman_image_t *         dst_image,
688                                int32_t                  src_x,
689                                int32_t                  src_y,
690                                int32_t                  mask_x,
691                                int32_t                  mask_y,
692                                int32_t                  dest_x,
693                                int32_t                  dest_y,
694                                int32_t                  width,
695                                int32_t                  height)
696 {
697     uint32_t    *dst_line, *dst;
698     uint32_t    *src_line, *src, s;
699     int dst_stride, src_stride;
700     uint8_t a;
701     int32_t w;
702
703     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
704     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
705
706     while (height--)
707     {
708         dst = dst_line;
709         dst_line += dst_stride;
710         src = src_line;
711         src_line += src_stride;
712         w = width;
713
714         while (w--)
715         {
716             s = *src++;
717             a = s >> 24;
718             if (a == 0xff)
719                 *dst = s;
720             else if (s)
721                 *dst = over (s, *dst);
722             dst++;
723         }
724     }
725 }
726
727 static void
728 fast_composite_src_x888_8888 (pixman_implementation_t *imp,
729                               pixman_op_t              op,
730                               pixman_image_t *         src_image,
731                               pixman_image_t *         mask_image,
732                               pixman_image_t *         dst_image,
733                               int32_t                  src_x,
734                               int32_t                  src_y,
735                               int32_t                  mask_x,
736                               int32_t                  mask_y,
737                               int32_t                  dest_x,
738                               int32_t                  dest_y,
739                               int32_t                  width,
740                               int32_t                  height)
741 {
742     uint32_t    *dst_line, *dst;
743     uint32_t    *src_line, *src;
744     int dst_stride, src_stride;
745     int32_t w;
746
747     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
748     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
749
750     while (height--)
751     {
752         dst = dst_line;
753         dst_line += dst_stride;
754         src = src_line;
755         src_line += src_stride;
756         w = width;
757
758         while (w--)
759             *dst++ = (*src++) | 0xff000000;
760     }
761 }
762
#if 0
/*
 * Compiled out by the surrounding #if 0.
 *
 * Blend a 32-bit source onto a destination made of packed 3-byte pixels.
 * Source pixels whose top byte is 0 are skipped; a top byte of 0xff
 * stores the source pixel with store_24(); anything else stores over()
 * of the source and the pixel read with fetch_24().  The destination
 * pointer advances 3 bytes per pixel.
 */
static void
fast_composite_over_8888_0888 (pixman_implementation_t *imp,
                               pixman_op_t              op,
                               pixman_image_t *         src_image,
                               pixman_image_t *         mask_image,
                               pixman_image_t *         dst_image,
                               int32_t                  src_x,
                               int32_t                  src_y,
                               int32_t                  mask_x,
                               int32_t                  mask_y,
                               int32_t                  dest_x,
                               int32_t                  dest_y,
                               int32_t                  width,
                               int32_t                  height)
{
    uint8_t  *dst_row;
    uint32_t *src_row;
    int       dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_row, 3);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_row, 1);

    for (; height != 0; height--)
    {
        uint8_t  *dp = dst_row;
        uint32_t *sp = src_row;
        int32_t   left;

        for (left = width; left != 0; left--, dp += 3)
        {
            uint32_t pixel = *sp++;
            uint8_t  alpha = pixel >> 24;
            uint32_t out;

            if (alpha == 0)
                continue;

            if (alpha != 0xff)
                out = over (pixel, fetch_24 (dp));
            else
                out = pixel;

            store_24 (dp, out);
        }

        dst_row += dst_stride;
        src_row += src_stride;
    }
}
#endif
815
816 static void
817 fast_composite_over_8888_0565 (pixman_implementation_t *imp,
818                                pixman_op_t              op,
819                                pixman_image_t *         src_image,
820                                pixman_image_t *         mask_image,
821                                pixman_image_t *         dst_image,
822                                int32_t                  src_x,
823                                int32_t                  src_y,
824                                int32_t                  mask_x,
825                                int32_t                  mask_y,
826                                int32_t                  dest_x,
827                                int32_t                  dest_y,
828                                int32_t                  width,
829                                int32_t                  height)
830 {
831     uint16_t    *dst_line, *dst;
832     uint32_t d;
833     uint32_t    *src_line, *src, s;
834     uint8_t a;
835     int dst_stride, src_stride;
836     int32_t w;
837
838     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
839     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
840
841     while (height--)
842     {
843         dst = dst_line;
844         dst_line += dst_stride;
845         src = src_line;
846         src_line += src_stride;
847         w = width;
848
849         while (w--)
850         {
851             s = *src++;
852             a = s >> 24;
853             if (s)
854             {
855                 if (a == 0xff)
856                 {
857                     d = s;
858                 }
859                 else
860                 {
861                     d = *dst;
862                     d = over (s, CONVERT_0565_TO_0888 (d));
863                 }
864                 *dst = CONVERT_8888_TO_0565 (d);
865             }
866             dst++;
867         }
868     }
869 }
870
871 static void
872 fast_composite_src_x888_0565 (pixman_implementation_t *imp,
873                               pixman_op_t              op,
874                               pixman_image_t *         src_image,
875                               pixman_image_t *         mask_image,
876                               pixman_image_t *         dst_image,
877                               int32_t                  src_x,
878                               int32_t                  src_y,
879                               int32_t                  mask_x,
880                               int32_t                  mask_y,
881                               int32_t                  dest_x,
882                               int32_t                  dest_y,
883                               int32_t                  width,
884                               int32_t                  height)
885 {
886     uint16_t    *dst_line, *dst;
887     uint32_t    *src_line, *src, s;
888     int dst_stride, src_stride;
889     int32_t w;
890
891     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
892     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
893
894     while (height--)
895     {
896         dst = dst_line;
897         dst_line += dst_stride;
898         src = src_line;
899         src_line += src_stride;
900         w = width;
901
902         while (w--)
903         {
904             s = *src++;
905             *dst = CONVERT_8888_TO_0565 (s);
906             dst++;
907         }
908     }
909 }
910
911 static void
912 fast_composite_add_8_8 (pixman_implementation_t *imp,
913                         pixman_op_t              op,
914                         pixman_image_t *         src_image,
915                         pixman_image_t *         mask_image,
916                         pixman_image_t *         dst_image,
917                         int32_t                  src_x,
918                         int32_t                  src_y,
919                         int32_t                  mask_x,
920                         int32_t                  mask_y,
921                         int32_t                  dest_x,
922                         int32_t                  dest_y,
923                         int32_t                  width,
924                         int32_t                  height)
925 {
926     uint8_t     *dst_line, *dst;
927     uint8_t     *src_line, *src;
928     int dst_stride, src_stride;
929     int32_t w;
930     uint8_t s, d;
931     uint16_t t;
932
933     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
934     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
935
936     while (height--)
937     {
938         dst = dst_line;
939         dst_line += dst_stride;
940         src = src_line;
941         src_line += src_stride;
942         w = width;
943
944         while (w--)
945         {
946             s = *src++;
947             if (s)
948             {
949                 if (s != 0xff)
950                 {
951                     d = *dst;
952                     t = d + s;
953                     s = t | (0 - (t >> 8));
954                 }
955                 *dst = s;
956             }
957             dst++;
958         }
959     }
960 }
961
962 static void
963 fast_composite_add_8888_8888 (pixman_implementation_t *imp,
964                               pixman_op_t              op,
965                               pixman_image_t *         src_image,
966                               pixman_image_t *         mask_image,
967                               pixman_image_t *         dst_image,
968                               int32_t                  src_x,
969                               int32_t                  src_y,
970                               int32_t                  mask_x,
971                               int32_t                  mask_y,
972                               int32_t                  dest_x,
973                               int32_t                  dest_y,
974                               int32_t                  width,
975                               int32_t                  height)
976 {
977     uint32_t    *dst_line, *dst;
978     uint32_t    *src_line, *src;
979     int dst_stride, src_stride;
980     int32_t w;
981     uint32_t s, d;
982
983     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
984     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
985
986     while (height--)
987     {
988         dst = dst_line;
989         dst_line += dst_stride;
990         src = src_line;
991         src_line += src_stride;
992         w = width;
993
994         while (w--)
995         {
996             s = *src++;
997             if (s)
998             {
999                 if (s != 0xffffffff)
1000                 {
1001                     d = *dst;
1002                     if (d)
1003                         UN8x4_ADD_UN8x4 (s, d);
1004                 }
1005                 *dst = s;
1006             }
1007             dst++;
1008         }
1009     }
1010 }
1011
1012 static void
1013 fast_composite_add_n_8_8 (pixman_implementation_t *imp,
1014                           pixman_op_t              op,
1015                           pixman_image_t *         src_image,
1016                           pixman_image_t *         mask_image,
1017                           pixman_image_t *         dst_image,
1018                           int32_t                  src_x,
1019                           int32_t                  src_y,
1020                           int32_t                  mask_x,
1021                           int32_t                  mask_y,
1022                           int32_t                  dest_x,
1023                           int32_t                  dest_y,
1024                           int32_t                  width,
1025                           int32_t                  height)
1026 {
1027     uint8_t     *dst_line, *dst;
1028     uint8_t     *mask_line, *mask;
1029     int dst_stride, mask_stride;
1030     int32_t w;
1031     uint32_t src;
1032     uint8_t sa;
1033
1034     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
1035     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1036     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
1037     sa = (src >> 24);
1038
1039     while (height--)
1040     {
1041         dst = dst_line;
1042         dst_line += dst_stride;
1043         mask = mask_line;
1044         mask_line += mask_stride;
1045         w = width;
1046
1047         while (w--)
1048         {
1049             uint16_t tmp;
1050             uint16_t a;
1051             uint32_t m, d;
1052             uint32_t r;
1053
1054             a = *mask++;
1055             d = *dst;
1056
1057             m = MUL_UN8 (sa, a, tmp);
1058             r = ADD_UN8 (m, d, tmp);
1059
1060             *dst++ = r;
1061         }
1062     }
1063 }
1064
/*
 * Bit helpers for 1 bpp data stored in arrays of 32-bit words.
 *
 * CREATE_BITMASK(n)  mask selecting bit n (0..31) of a word; bit 0 is the
 *                    most significant bit when WORDS_BIGENDIAN is defined
 *                    and the least significant bit otherwise.
 * UPDATE_BITMASK(m)  moves mask m on to the next bit; on a 32-bit unsigned
 *                    mask the result is 0 once the last bit has been passed.
 * TEST_BIT(p, n)     non-zero if bit n of the bit array at p is set.
 * SET_BIT(p, n)      sets bit n of the bit array at p.
 *
 * The masks are built from unsigned constants: (1 << 31) on a signed int is
 * undefined behaviour.  SET_BIT deliberately has no trailing semicolon so
 * that "if (c) SET_BIT (p, n); else ..." parses as one statement.
 */
#ifdef WORDS_BIGENDIAN
#define CREATE_BITMASK(n) (0x80000000U >> (n))
#define UPDATE_BITMASK(n) ((n) >> 1)
#else
#define CREATE_BITMASK(n) (1U << (n))
#define UPDATE_BITMASK(n) ((n) << 1)
#endif

#define TEST_BIT(p, n)                                  \
    (*((p) + ((n) >> 5)) & CREATE_BITMASK ((n) & 31))
#define SET_BIT(p, n)                                                   \
    do { *((p) + ((n) >> 5)) |= CREATE_BITMASK ((n) & 31); } while (0)
1077
1078 static void
1079 fast_composite_add_1000_1000 (pixman_implementation_t *imp,
1080                               pixman_op_t              op,
1081                               pixman_image_t *         src_image,
1082                               pixman_image_t *         mask_image,
1083                               pixman_image_t *         dst_image,
1084                               int32_t                  src_x,
1085                               int32_t                  src_y,
1086                               int32_t                  mask_x,
1087                               int32_t                  mask_y,
1088                               int32_t                  dest_x,
1089                               int32_t                  dest_y,
1090                               int32_t                  width,
1091                               int32_t                  height)
1092 {
1093     uint32_t     *dst_line, *dst;
1094     uint32_t     *src_line, *src;
1095     int           dst_stride, src_stride;
1096     int32_t       w;
1097
1098     PIXMAN_IMAGE_GET_LINE (src_image, 0, src_y, uint32_t,
1099                            src_stride, src_line, 1);
1100     PIXMAN_IMAGE_GET_LINE (dst_image, 0, dest_y, uint32_t,
1101                            dst_stride, dst_line, 1);
1102
1103     while (height--)
1104     {
1105         dst = dst_line;
1106         dst_line += dst_stride;
1107         src = src_line;
1108         src_line += src_stride;
1109         w = width;
1110
1111         while (w--)
1112         {
1113             /*
1114              * TODO: improve performance by processing uint32_t data instead
1115              *       of individual bits
1116              */
1117             if (TEST_BIT (src, src_x + w))
1118                 SET_BIT (dst, dest_x + w);
1119         }
1120     }
1121 }
1122
1123 static void
1124 fast_composite_over_n_1_8888 (pixman_implementation_t *imp,
1125                               pixman_op_t              op,
1126                               pixman_image_t *         src_image,
1127                               pixman_image_t *         mask_image,
1128                               pixman_image_t *         dst_image,
1129                               int32_t                  src_x,
1130                               int32_t                  src_y,
1131                               int32_t                  mask_x,
1132                               int32_t                  mask_y,
1133                               int32_t                  dest_x,
1134                               int32_t                  dest_y,
1135                               int32_t                  width,
1136                               int32_t                  height)
1137 {
1138     uint32_t     src, srca;
1139     uint32_t    *dst, *dst_line;
1140     uint32_t    *mask, *mask_line;
1141     int          mask_stride, dst_stride;
1142     uint32_t     bitcache, bitmask;
1143     int32_t      w;
1144
1145     if (width <= 0)
1146         return;
1147
1148     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
1149     srca = src >> 24;
1150     if (src == 0)
1151         return;
1152
1153     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t,
1154                            dst_stride, dst_line, 1);
1155     PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
1156                            mask_stride, mask_line, 1);
1157     mask_line += mask_x >> 5;
1158
1159     if (srca == 0xff)
1160     {
1161         while (height--)
1162         {
1163             dst = dst_line;
1164             dst_line += dst_stride;
1165             mask = mask_line;
1166             mask_line += mask_stride;
1167             w = width;
1168
1169             bitcache = *mask++;
1170             bitmask = CREATE_BITMASK (mask_x & 31);
1171
1172             while (w--)
1173             {
1174                 if (bitmask == 0)
1175                 {
1176                     bitcache = *mask++;
1177                     bitmask = CREATE_BITMASK (0);
1178                 }
1179                 if (bitcache & bitmask)
1180                     *dst = src;
1181                 bitmask = UPDATE_BITMASK (bitmask);
1182                 dst++;
1183             }
1184         }
1185     }
1186     else
1187     {
1188         while (height--)
1189         {
1190             dst = dst_line;
1191             dst_line += dst_stride;
1192             mask = mask_line;
1193             mask_line += mask_stride;
1194             w = width;
1195
1196             bitcache = *mask++;
1197             bitmask = CREATE_BITMASK (mask_x & 31);
1198
1199             while (w--)
1200             {
1201                 if (bitmask == 0)
1202                 {
1203                     bitcache = *mask++;
1204                     bitmask = CREATE_BITMASK (0);
1205                 }
1206                 if (bitcache & bitmask)
1207                     *dst = over (src, *dst);
1208                 bitmask = UPDATE_BITMASK (bitmask);
1209                 dst++;
1210             }
1211         }
1212     }
1213 }
1214
1215 static void
1216 fast_composite_over_n_1_0565 (pixman_implementation_t *imp,
1217                               pixman_op_t              op,
1218                               pixman_image_t *         src_image,
1219                               pixman_image_t *         mask_image,
1220                               pixman_image_t *         dst_image,
1221                               int32_t                  src_x,
1222                               int32_t                  src_y,
1223                               int32_t                  mask_x,
1224                               int32_t                  mask_y,
1225                               int32_t                  dest_x,
1226                               int32_t                  dest_y,
1227                               int32_t                  width,
1228                               int32_t                  height)
1229 {
1230     uint32_t     src, srca;
1231     uint16_t    *dst, *dst_line;
1232     uint32_t    *mask, *mask_line;
1233     int          mask_stride, dst_stride;
1234     uint32_t     bitcache, bitmask;
1235     int32_t      w;
1236     uint32_t     d;
1237     uint16_t     src565;
1238
1239     if (width <= 0)
1240         return;
1241
1242     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
1243     srca = src >> 24;
1244     if (src == 0)
1245         return;
1246
1247     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t,
1248                            dst_stride, dst_line, 1);
1249     PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
1250                            mask_stride, mask_line, 1);
1251     mask_line += mask_x >> 5;
1252
1253     if (srca == 0xff)
1254     {
1255         src565 = CONVERT_8888_TO_0565 (src);
1256         while (height--)
1257         {
1258             dst = dst_line;
1259             dst_line += dst_stride;
1260             mask = mask_line;
1261             mask_line += mask_stride;
1262             w = width;
1263
1264             bitcache = *mask++;
1265             bitmask = CREATE_BITMASK (mask_x & 31);
1266
1267             while (w--)
1268             {
1269                 if (bitmask == 0)
1270                 {
1271                     bitcache = *mask++;
1272                     bitmask = CREATE_BITMASK (0);
1273                 }
1274                 if (bitcache & bitmask)
1275                     *dst = src565;
1276                 bitmask = UPDATE_BITMASK (bitmask);
1277                 dst++;
1278             }
1279         }
1280     }
1281     else
1282     {
1283         while (height--)
1284         {
1285             dst = dst_line;
1286             dst_line += dst_stride;
1287             mask = mask_line;
1288             mask_line += mask_stride;
1289             w = width;
1290
1291             bitcache = *mask++;
1292             bitmask = CREATE_BITMASK (mask_x & 31);
1293
1294             while (w--)
1295             {
1296                 if (bitmask == 0)
1297                 {
1298                     bitcache = *mask++;
1299                     bitmask = CREATE_BITMASK (0);
1300                 }
1301                 if (bitcache & bitmask)
1302                 {
1303                     d = over (src, CONVERT_0565_TO_0888 (*dst));
1304                     *dst = CONVERT_8888_TO_0565 (d);
1305                 }
1306                 bitmask = UPDATE_BITMASK (bitmask);
1307                 dst++;
1308             }
1309         }
1310     }
1311 }
1312
1313 /*
1314  * Simple bitblt
1315  */
1316
1317 static void
1318 fast_composite_solid_fill (pixman_implementation_t *imp,
1319                            pixman_op_t              op,
1320                            pixman_image_t *         src_image,
1321                            pixman_image_t *         mask_image,
1322                            pixman_image_t *         dst_image,
1323                            int32_t                  src_x,
1324                            int32_t                  src_y,
1325                            int32_t                  mask_x,
1326                            int32_t                  mask_y,
1327                            int32_t                  dest_x,
1328                            int32_t                  dest_y,
1329                            int32_t                  width,
1330                            int32_t                  height)
1331 {
1332     uint32_t src;
1333
1334     src = _pixman_image_get_solid (imp, src_image, dst_image->bits.format);
1335
1336     if (dst_image->bits.format == PIXMAN_a1)
1337     {
1338         src = src >> 31;
1339     }
1340     else if (dst_image->bits.format == PIXMAN_a8)
1341     {
1342         src = src >> 24;
1343     }
1344     else if (dst_image->bits.format == PIXMAN_r5g6b5 ||
1345              dst_image->bits.format == PIXMAN_b5g6r5)
1346     {
1347         src = CONVERT_8888_TO_0565 (src);
1348     }
1349
1350     pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
1351                  PIXMAN_FORMAT_BPP (dst_image->bits.format),
1352                  dest_x, dest_y,
1353                  width, height,
1354                  src);
1355 }
1356
1357 static void
1358 fast_composite_src_memcpy (pixman_implementation_t *imp,
1359                            pixman_op_t              op,
1360                            pixman_image_t *         src_image,
1361                            pixman_image_t *         mask_image,
1362                            pixman_image_t *         dst_image,
1363                            int32_t                  src_x,
1364                            int32_t                  src_y,
1365                            int32_t                  mask_x,
1366                            int32_t                  mask_y,
1367                            int32_t                  dest_x,
1368                            int32_t                  dest_y,
1369                            int32_t                  width,
1370                            int32_t                  height)
1371 {
1372     int bpp = PIXMAN_FORMAT_BPP (dst_image->bits.format) / 8;
1373     uint32_t n_bytes = width * bpp;
1374     int dst_stride, src_stride;
1375     uint8_t    *dst;
1376     uint8_t    *src;
1377
1378     src_stride = src_image->bits.rowstride * 4;
1379     dst_stride = dst_image->bits.rowstride * 4;
1380
1381     src = (uint8_t *)src_image->bits.bits + src_y * src_stride + src_x * bpp;
1382     dst = (uint8_t *)dst_image->bits.bits + dest_y * dst_stride + dest_x * bpp;
1383
1384     while (height--)
1385     {
1386         memcpy (dst, src, n_bytes);
1387
1388         dst += dst_stride;
1389         src += src_stride;
1390     }
1391 }
1392
/*
 * Instantiations of FAST_NEAREST, one per line.
 *
 * NOTE(review): the macro is not defined in this part of the file
 * (presumably it comes from pixman-fast-path.h -- confirm).  Judging by the
 * invocations, the arguments are: function-name suffix, source format,
 * destination format, source pixel type, destination pixel type, operator
 * and source repeat mode.
 *
 * 0565 -> 0565 SRC appears here only for the NORMAL repeat mode; the COVER,
 * NONE and PAD variants are instantiated further down through
 * FAST_NEAREST_MAINLOOP with a dedicated scanline function.
 */
FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, SRC, COVER)
FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, SRC, NONE)
FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, SRC, PAD)
FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, SRC, NORMAL)
FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, OVER, COVER)
FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, OVER, NONE)
FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, OVER, PAD)
FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, OVER, NORMAL)
FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER)
FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, NONE)
FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD)
FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL)
FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL)
FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER)
FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE)
FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD)
FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL)
1410
1411 /* Use more unrolling for src_0565_0565 because it is typically CPU bound */
1412 static force_inline void
1413 scaled_nearest_scanline_565_565_SRC (uint16_t *       dst,
1414                                      const uint16_t * src,
1415                                      int32_t          w,
1416                                      pixman_fixed_t   vx,
1417                                      pixman_fixed_t   unit_x,
1418                                      pixman_fixed_t   max_vx,
1419                                      pixman_bool_t    fully_transparent_src)
1420 {
1421     uint16_t tmp1, tmp2, tmp3, tmp4;
1422     while ((w -= 4) >= 0)
1423     {
1424         tmp1 = src[pixman_fixed_to_int (vx)];
1425         vx += unit_x;
1426         tmp2 = src[pixman_fixed_to_int (vx)];
1427         vx += unit_x;
1428         tmp3 = src[pixman_fixed_to_int (vx)];
1429         vx += unit_x;
1430         tmp4 = src[pixman_fixed_to_int (vx)];
1431         vx += unit_x;
1432         *dst++ = tmp1;
1433         *dst++ = tmp2;
1434         *dst++ = tmp3;
1435         *dst++ = tmp4;
1436     }
1437     if (w & 2)
1438     {
1439         tmp1 = src[pixman_fixed_to_int (vx)];
1440         vx += unit_x;
1441         tmp2 = src[pixman_fixed_to_int (vx)];
1442         vx += unit_x;
1443         *dst++ = tmp1;
1444         *dst++ = tmp2;
1445     }
1446     if (w & 1)
1447         *dst++ = src[pixman_fixed_to_int (vx)];
1448 }
1449
/*
 * Instantiate FAST_NEAREST_MAINLOOP for the COVER, NONE and PAD repeat
 * variants of 0565 -> 0565 SRC, passing the hand-unrolled
 * scaled_nearest_scanline_565_565_SRC as the scanline function.
 *
 * NOTE(review): the macro is not defined in this part of the file
 * (presumably it comes from pixman-fast-path.h -- confirm).  Judging by the
 * invocations, the arguments are: function-name suffix, scanline function,
 * source pixel type, destination pixel type and source repeat mode.
 */
FAST_NEAREST_MAINLOOP (565_565_cover_SRC,
                       scaled_nearest_scanline_565_565_SRC,
                       uint16_t, uint16_t, COVER)
FAST_NEAREST_MAINLOOP (565_565_none_SRC,
                       scaled_nearest_scanline_565_565_SRC,
                       uint16_t, uint16_t, NONE)
FAST_NEAREST_MAINLOOP (565_565_pad_SRC,
                       scaled_nearest_scanline_565_565_SRC,
                       uint16_t, uint16_t, PAD)
1459
1460 static force_inline uint32_t
1461 fetch_nearest (pixman_repeat_t src_repeat,
1462                pixman_format_code_t format,
1463                uint32_t *src, int x, int src_width)
1464 {
1465     if (repeat (src_repeat, &x, src_width))
1466     {
1467         if (format == PIXMAN_x8r8g8b8)
1468             return *(src + x) | 0xff000000;
1469         else
1470             return *(src + x);
1471     }
1472     else
1473     {
1474         return 0;
1475     }
1476 }
1477
1478 static force_inline void
1479 combine_over (uint32_t s, uint32_t *dst)
1480 {
1481     if (s)
1482     {
1483         uint8_t ia = 0xff - (s >> 24);
1484
1485         if (ia)
1486             UN8x4_MUL_UN8_ADD_UN8x4 (*dst, ia, s);
1487         else
1488             *dst = s;
1489     }
1490 }
1491
1492 static force_inline void
1493 combine_src (uint32_t s, uint32_t *dst)
1494 {
1495     *dst = s;
1496 }
1497
1498 static void
1499 fast_composite_scaled_nearest (pixman_implementation_t *imp,
1500                                pixman_op_t              op,
1501                                pixman_image_t *         src_image,
1502                                pixman_image_t *         mask_image,
1503                                pixman_image_t *         dst_image,
1504                                int32_t                  src_x,
1505                                int32_t                  src_y,
1506                                int32_t                  mask_x,
1507                                int32_t                  mask_y,
1508                                int32_t                  dest_x,
1509                                int32_t                  dest_y,
1510                                int32_t                  width,
1511                                int32_t                  height)
1512 {
1513     uint32_t       *dst_line;
1514     uint32_t       *src_line;
1515     int             dst_stride, src_stride;
1516     int             src_width, src_height;
1517     pixman_repeat_t src_repeat;
1518     pixman_fixed_t unit_x, unit_y;
1519     pixman_format_code_t src_format;
1520     pixman_vector_t v;
1521     pixman_fixed_t vy;
1522
1523     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1524     /* pass in 0 instead of src_x and src_y because src_x and src_y need to be
1525      * transformed from destination space to source space
1526      */
1527     PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, uint32_t, src_stride, src_line, 1);
1528
1529     /* reference point is the center of the pixel */
1530     v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;
1531     v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;
1532     v.vector[2] = pixman_fixed_1;
1533
1534     if (!pixman_transform_point_3d (src_image->common.transform, &v))
1535         return;
1536
1537     unit_x = src_image->common.transform->matrix[0][0];
1538     unit_y = src_image->common.transform->matrix[1][1];
1539
1540     /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */
1541     v.vector[0] -= pixman_fixed_e;
1542     v.vector[1] -= pixman_fixed_e;
1543
1544     src_height = src_image->bits.height;
1545     src_width = src_image->bits.width;
1546     src_repeat = src_image->common.repeat;
1547     src_format = src_image->bits.format;
1548
1549     vy = v.vector[1];
1550     while (height--)
1551     {
1552         pixman_fixed_t vx = v.vector[0];
1553         int y = pixman_fixed_to_int (vy);
1554         uint32_t *dst = dst_line;
1555
1556         dst_line += dst_stride;
1557
1558         /* adjust the y location by a unit vector in the y direction
1559          * this is equivalent to transforming y+1 of the destination point to source space */
1560         vy += unit_y;
1561
1562         if (!repeat (src_repeat, &y, src_height))
1563         {
1564             if (op == PIXMAN_OP_SRC)
1565                 memset (dst, 0, sizeof (*dst) * width);
1566         }
1567         else
1568         {
1569             int w = width;
1570
1571             uint32_t *src = src_line + y * src_stride;
1572
1573             while (w >= 2)
1574             {
1575                 uint32_t s1, s2;
1576                 int x1, x2;
1577
1578                 x1 = pixman_fixed_to_int (vx);
1579                 vx += unit_x;
1580
1581                 x2 = pixman_fixed_to_int (vx);
1582                 vx += unit_x;
1583
1584                 w -= 2;
1585
1586                 s1 = fetch_nearest (src_repeat, src_format, src, x1, src_width);
1587                 s2 = fetch_nearest (src_repeat, src_format, src, x2, src_width);
1588
1589                 if (op == PIXMAN_OP_OVER)
1590                 {
1591                     combine_over (s1, dst++);
1592                     combine_over (s2, dst++);
1593                 }
1594                 else
1595                 {
1596                     combine_src (s1, dst++);
1597                     combine_src (s2, dst++);
1598                 }
1599             }
1600
1601             while (w--)
1602             {
1603                 uint32_t s;
1604                 int x;
1605
1606                 x = pixman_fixed_to_int (vx);
1607                 vx += unit_x;
1608
1609                 s = fetch_nearest (src_repeat, src_format, src, x, src_width);
1610
1611                 if (op == PIXMAN_OP_OVER)
1612                     combine_over (s, dst++);
1613                 else
1614                     combine_src (s, dst++);
1615             }
1616         }
1617     }
1618 }
1619
1620 #define CACHE_LINE_SIZE 64
1621
1622 #define FAST_SIMPLE_ROTATE(suffix, pix_type)                                  \
1623                                                                               \
1624 static void                                                                   \
1625 blt_rotated_90_trivial_##suffix (pix_type       *dst,                         \
1626                                  int             dst_stride,                  \
1627                                  const pix_type *src,                         \
1628                                  int             src_stride,                  \
1629                                  int             w,                           \
1630                                  int             h)                           \
1631 {                                                                             \
1632     int x, y;                                                                 \
1633     for (y = 0; y < h; y++)                                                   \
1634     {                                                                         \
1635         const pix_type *s = src + (h - y - 1);                                \
1636         pix_type *d = dst + dst_stride * y;                                   \
1637         for (x = 0; x < w; x++)                                               \
1638         {                                                                     \
1639             *d++ = *s;                                                        \
1640             s += src_stride;                                                  \
1641         }                                                                     \
1642     }                                                                         \
1643 }                                                                             \
1644                                                                               \
1645 static void                                                                   \
1646 blt_rotated_270_trivial_##suffix (pix_type       *dst,                        \
1647                                   int             dst_stride,                 \
1648                                   const pix_type *src,                        \
1649                                   int             src_stride,                 \
1650                                   int             w,                          \
1651                                   int             h)                          \
1652 {                                                                             \
1653     int x, y;                                                                 \
1654     for (y = 0; y < h; y++)                                                   \
1655     {                                                                         \
1656         const pix_type *s = src + src_stride * (w - 1) + y;                   \
1657         pix_type *d = dst + dst_stride * y;                                   \
1658         for (x = 0; x < w; x++)                                               \
1659         {                                                                     \
1660             *d++ = *s;                                                        \
1661             s -= src_stride;                                                  \
1662         }                                                                     \
1663     }                                                                         \
1664 }                                                                             \
1665                                                                               \
/*                                                                            \
 * Blit W destination columns by H rows via blt_rotated_90_trivial,           \
 * working in vertical stripes: an unaligned leading stripe, stripes of       \
 * TILE_SIZE columns that start on a destination cache line boundary,         \
 * and an unaligned trailing stripe.  The stripe at destination column        \
 * x reads the source from row x onwards (src + x * src_stride), so           \
 * src_stride is counted in pix_type units.                                   \
 */                                                                           \
static void                                                                   \
blt_rotated_90_##suffix (pix_type       *dst,                                 \
                         int             dst_stride,                          \
                         const pix_type *src,                                 \
                         int             src_stride,                          \
                         int             W,                                   \
                         int             H)                                   \
{                                                                             \
    int x;                                                                    \
    int leading_pixels = 0, trailing_pixels = 0;                              \
    const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type);                 \
                                                                              \
    /*                                                                        \
     * split processing into handling destination as TILE_SIZExH cache line   \
     * aligned vertical stripes (optimistically assuming that destination     \
     * stride is a multiple of cache line, if not - it will be just a bit     \
     * slower)                                                                \
     */                                                                       \
                                                                              \
    if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1))                               \
    {                                                                         \
        /* pixels left until dst reaches the next cache line boundary */      \
        leading_pixels = TILE_SIZE - (((uintptr_t)dst &                       \
                            (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
        if (leading_pixels > W)                                               \
            leading_pixels = W;                                               \
                                                                              \
        /* unaligned leading part NxH (where N < TILE_SIZE) */                \
        blt_rotated_90_trivial_##suffix (                                     \
            dst,                                                              \
            dst_stride,                                                       \
            src,                                                              \
            src_stride,                                                       \
            leading_pixels,                                                   \
            H);                                                               \
                                                                              \
        /* skip the columns just written and the source rows they used */     \
        dst += leading_pixels;                                                \
        src += leading_pixels * src_stride;                                   \
        W -= leading_pixels;                                                  \
    }                                                                         \
                                                                              \
    if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1))                         \
    {                                                                         \
        /* columns beyond the last cache line boundary before dst + W; */     \
        /* they are handled after the aligned middle part */                  \
        trailing_pixels = (((uintptr_t)(dst + W) &                            \
                            (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
        if (trailing_pixels > W)                                              \
            trailing_pixels = W;                                              \
        W -= trailing_pixels;                                                 \
    }                                                                         \
                                                                              \
    for (x = 0; x < W; x += TILE_SIZE)                                        \
    {                                                                         \
        /* aligned middle part TILE_SIZExH */                                 \
        blt_rotated_90_trivial_##suffix (                                     \
            dst + x,                                                          \
            dst_stride,                                                       \
            src + src_stride * x,                                             \
            src_stride,                                                       \
            TILE_SIZE,                                                        \
            H);                                                               \
    }                                                                         \
                                                                              \
    if (trailing_pixels)                                                      \
    {                                                                         \
        /* unaligned trailing part NxH (where N < TILE_SIZE) */               \
        blt_rotated_90_trivial_##suffix (                                     \
            dst + W,                                                          \
            dst_stride,                                                       \
            src + W * src_stride,                                             \
            src_stride,                                                       \
            trailing_pixels,                                                  \
            H);                                                               \
    }                                                                         \
}                                                                             \
1739                                                                               \
/*                                                                            \
 * Blit W destination columns by H rows via blt_rotated_270_trivial,          \
 * using the same leading / aligned / trailing stripes as the 90 degree       \
 * variant.  Source rows are consumed in the opposite order: a stripe         \
 * of n columns at destination column x starts at source row                  \
 * W - x - n, counted from the src and W passed in by the caller.             \
 */                                                                           \
static void                                                                   \
blt_rotated_270_##suffix (pix_type       *dst,                                \
                          int             dst_stride,                         \
                          const pix_type *src,                                \
                          int             src_stride,                         \
                          int             W,                                  \
                          int             H)                                  \
{                                                                             \
    int x;                                                                    \
    int leading_pixels = 0, trailing_pixels = 0;                              \
    const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type);                 \
                                                                              \
    /*                                                                        \
     * split processing into handling destination as TILE_SIZExH cache line   \
     * aligned vertical stripes (optimistically assuming that destination     \
     * stride is a multiple of cache line, if not - it will be just a bit     \
     * slower)                                                                \
     */                                                                       \
                                                                              \
    if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1))                               \
    {                                                                         \
        /* pixels left until dst reaches the next cache line boundary */      \
        leading_pixels = TILE_SIZE - (((uintptr_t)dst &                       \
                            (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
        if (leading_pixels > W)                                               \
            leading_pixels = W;                                               \
                                                                              \
        /* unaligned leading part NxH (where N < TILE_SIZE) */                \
        blt_rotated_270_trivial_##suffix (                                    \
            dst,                                                              \
            dst_stride,                                                       \
            src + src_stride * (W - leading_pixels),                          \
            src_stride,                                                       \
            leading_pixels,                                                   \
            H);                                                               \
                                                                              \
        /* the leading stripe used the last source rows: src stays put */     \
        dst += leading_pixels;                                                \
        W -= leading_pixels;                                                  \
    }                                                                         \
                                                                              \
    if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1))                         \
    {                                                                         \
        /* columns beyond the last cache line boundary before dst + W; */     \
        /* they are handled after the aligned middle part */                  \
        trailing_pixels = (((uintptr_t)(dst + W) &                            \
                            (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
        if (trailing_pixels > W)                                              \
            trailing_pixels = W;                                              \
        W -= trailing_pixels;                                                 \
        /* keep the first source rows for the trailing stripe below */        \
        src += trailing_pixels * src_stride;                                  \
    }                                                                         \
                                                                              \
    for (x = 0; x < W; x += TILE_SIZE)                                        \
    {                                                                         \
        /* aligned middle part TILE_SIZExH */                                 \
        blt_rotated_270_trivial_##suffix (                                    \
            dst + x,                                                          \
            dst_stride,                                                       \
            src + src_stride * (W - x - TILE_SIZE),                           \
            src_stride,                                                       \
            TILE_SIZE,                                                        \
            H);                                                               \
    }                                                                         \
                                                                              \
    if (trailing_pixels)                                                      \
    {                                                                         \
        /* unaligned trailing part NxH (where N < TILE_SIZE) */               \
        blt_rotated_270_trivial_##suffix (                                    \
            dst + W,                                                          \
            dst_stride,                                                       \
            src - trailing_pixels * src_stride,                               \
            src_stride,                                                       \
            trailing_pixels,                                                  \
            H);                                                               \
    }                                                                         \
}                                                                             \
1813                                                                               \
1814 static void                                                                   \
1815 fast_composite_rotate_90_##suffix (pixman_implementation_t *imp,              \
1816                                    pixman_op_t              op,               \
1817                                    pixman_image_t *         src_image,        \
1818                                    pixman_image_t *         mask_image,       \
1819                                    pixman_image_t *         dst_image,        \
1820                                    int32_t                  src_x,            \
1821                                    int32_t                  src_y,            \
1822                                    int32_t                  mask_x,           \
1823                                    int32_t                  mask_y,           \
1824                                    int32_t                  dest_x,           \
1825                                    int32_t                  dest_y,           \
1826                                    int32_t                  width,            \
1827                                    int32_t                  height)           \
1828 {                                                                             \
1829     pix_type       *dst_line;                                                 \
1830     pix_type       *src_line;                                                 \
1831     int             dst_stride, src_stride;                                   \
1832     int             src_x_t, src_y_t;                                         \
1833                                                                               \
1834     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, pix_type,               \
1835                            dst_stride, dst_line, 1);                          \
1836     src_x_t = -src_y + pixman_fixed_to_int (                                  \
1837                                 src_image->common.transform->matrix[0][2] +   \
1838                                 pixman_fixed_1 / 2 - pixman_fixed_e) - height;\
1839     src_y_t = src_x + pixman_fixed_to_int (                                   \
1840                                 src_image->common.transform->matrix[1][2] +   \
1841                                 pixman_fixed_1 / 2 - pixman_fixed_e);         \
1842     PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type,             \
1843                            src_stride, src_line, 1);                          \
1844     blt_rotated_90_##suffix (dst_line, dst_stride, src_line, src_stride,      \
1845                              width, height);                                  \
1846 }                                                                             \
1847                                                                               \
1848 static void                                                                   \
1849 fast_composite_rotate_270_##suffix (pixman_implementation_t *imp,             \
1850                                     pixman_op_t              op,              \
1851                                     pixman_image_t *         src_image,       \
1852                                     pixman_image_t *         mask_image,      \
1853                                     pixman_image_t *         dst_image,       \
1854                                     int32_t                  src_x,           \
1855                                     int32_t                  src_y,           \
1856                                     int32_t                  mask_x,          \
1857                                     int32_t                  mask_y,          \
1858                                     int32_t                  dest_x,          \
1859                                     int32_t                  dest_y,          \
1860                                     int32_t                  width,           \
1861                                     int32_t                  height)          \
1862 {                                                                             \
1863     pix_type       *dst_line;                                                 \
1864     pix_type       *src_line;                                                 \
1865     int             dst_stride, src_stride;                                   \
1866     int             src_x_t, src_y_t;                                         \
1867                                                                               \
1868     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, pix_type,               \
1869                            dst_stride, dst_line, 1);                          \
1870     src_x_t = src_y + pixman_fixed_to_int (                                   \
1871                                 src_image->common.transform->matrix[0][2] +   \
1872                                 pixman_fixed_1 / 2 - pixman_fixed_e);         \
1873     src_y_t = -src_x + pixman_fixed_to_int (                                  \
1874                                 src_image->common.transform->matrix[1][2] +   \
1875                                 pixman_fixed_1 / 2 - pixman_fixed_e) - width; \
1876     PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type,             \
1877                            src_stride, src_line, 1);                          \
1878     blt_rotated_270_##suffix (dst_line, dst_stride, src_line, src_stride,     \
1879                               width, height);                                 \
1880 }
1881
/*
 * Instantiate the rotation helpers and the fast_composite_rotate_90/270
 * entry points for uint8_t, uint16_t and uint32_t pixels (suffixes 8, 565
 * and 8888).
 */
FAST_SIMPLE_ROTATE (8, uint8_t)
FAST_SIMPLE_ROTATE (565, uint16_t)
FAST_SIMPLE_ROTATE (8888, uint32_t)
1885
/*
 * Table of the fast paths implemented in this file.  Each entry names an
 * operator, the source / mask / destination formats (with their required
 * flags) and the routine that handles that combination.  The table ends
 * with a PIXMAN_OP_NONE entry and is handed to
 * _pixman_implementation_create () when the fast path implementation is
 * created.
 *
 * NOTE(review): entries are presumably matched in table order - confirm
 * against the lookup code before reordering anything here.
 */
static const pixman_fast_path_t c_fast_paths[] =
{
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, fast_composite_over_n_8_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, fast_composite_over_n_8_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r8g8b8, fast_composite_over_n_8_0888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b8g8r8, fast_composite_over_n_8_0888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, fast_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, fast_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, fast_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, fast_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8r8g8b8, fast_composite_over_n_1_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8r8g8b8, fast_composite_over_n_1_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8b8g8r8, fast_composite_over_n_1_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8b8g8r8, fast_composite_over_n_1_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a1, r5g6b5,   fast_composite_over_n_1_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, a1, b5g6r5,   fast_composite_over_n_1_0565),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, fast_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, fast_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, fast_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, fast_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, fast_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, fast_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, fast_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, fast_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, fast_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, fast_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, fast_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, fast_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, fast_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, fast_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, fast_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, fast_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, fast_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, fast_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8_8),
    PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1000_1000),
    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, fast_composite_add_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, fast_composite_add_n_8_8),
    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8r8g8b8, fast_composite_solid_fill),
    PIXMAN_STD_FAST_PATH (SRC, solid, null, x8r8g8b8, fast_composite_solid_fill),
    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8b8g8r8, fast_composite_solid_fill),
    PIXMAN_STD_FAST_PATH (SRC, solid, null, x8b8g8r8, fast_composite_solid_fill),
    PIXMAN_STD_FAST_PATH (SRC, solid, null, a1, fast_composite_solid_fill),
    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8, fast_composite_solid_fill),
    PIXMAN_STD_FAST_PATH (SRC, solid, null, r5g6b5, fast_composite_solid_fill),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, fast_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, fast_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, fast_composite_src_memcpy),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, fast_composite_src_memcpy),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
    PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8x8, fast_composite_src_memcpy),
    PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8a8, fast_composite_src_memcpy),
    PIXMAN_STD_FAST_PATH (SRC, b8g8r8x8, null, b8g8r8x8, fast_composite_src_memcpy),
    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, fast_composite_src_memcpy),
    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, fast_composite_src_memcpy),
    PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, fast_composite_src_memcpy),
    PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, b8g8r8, fast_composite_src_memcpy),
    PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
    PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
    PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, fast_composite_src_memcpy),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, fast_composite_in_8_8),
    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, fast_composite_in_n_8_8),

    SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888_8888),
    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888_8888),
    SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, 8888_8888),
    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, 8888_8888),

    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888_8888),
    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, 8888_8888),

    SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, 8888_565),
    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, 8888_565),

    SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, 565_565),

    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, 8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, 8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, 8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, 8888_8888),

    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, 8888_565),

/*
 * Entry for the generic scaled-nearest routine: source format s with
 * SCALED_NEAREST_FLAGS, no mask, destination format d; every such entry
 * is routed to fast_composite_scaled_nearest.
 */
#define NEAREST_FAST_PATH(op,s,d)               \
    {   PIXMAN_OP_ ## op,                       \
        PIXMAN_ ## s, SCALED_NEAREST_FLAGS,     \
        PIXMAN_null, 0,                         \
        PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
        fast_composite_scaled_nearest,          \
    }

    NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8),
    NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8),
    NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8),
    NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8),

    NEAREST_FAST_PATH (SRC, x8r8g8b8, a8r8g8b8),
    NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8),
    NEAREST_FAST_PATH (SRC, x8b8g8r8, a8b8g8r8),
    NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8),

    NEAREST_FAST_PATH (OVER, x8r8g8b8, x8r8g8b8),
    NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8),
    NEAREST_FAST_PATH (OVER, x8b8g8r8, x8b8g8r8),
    NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8),

    NEAREST_FAST_PATH (OVER, x8r8g8b8, a8r8g8b8),
    NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8),
    NEAREST_FAST_PATH (OVER, x8b8g8r8, a8b8g8r8),
    NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8),

/* Source flags a rotation fast path requires for the given angle. */
#define SIMPLE_ROTATE_FLAGS(angle)                                        \
    (FAST_PATH_ROTATE_ ## angle ## _TRANSFORM   |                         \
     FAST_PATH_NEAREST_FILTER                   |                         \
     FAST_PATH_SAMPLES_COVER_CLIP               |                         \
     FAST_PATH_STANDARD_FLAGS)

/*
 * Expands to two table entries for one format pair: the 90 degree routine
 * followed by the 270 degree routine, both without a mask.
 */
#define SIMPLE_ROTATE_FAST_PATH(op,s,d,suffix)                            \
    {   PIXMAN_OP_ ## op,                                                 \
        PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (90),                           \
        PIXMAN_null, 0,                                                   \
        PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                           \
        fast_composite_rotate_90_##suffix,                                \
    },                                                                    \
    {   PIXMAN_OP_ ## op,                                                 \
        PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (270),                          \
        PIXMAN_null, 0,                                                   \
        PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                           \
        fast_composite_rotate_270_##suffix,                               \
    }

    SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888),
    SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888),
    SIMPLE_ROTATE_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888),
    SIMPLE_ROTATE_FAST_PATH (SRC, r5g6b5, r5g6b5, 565),
    SIMPLE_ROTATE_FAST_PATH (SRC, a8, a8, 8),

    /* end-of-table marker */
    {   PIXMAN_OP_NONE  },
};
2032
/*
 * A1_FILL_MASK (n, offs) yields a 32-bit mask with n consecutive bits set,
 * covering pixels offs .. offs + n - 1 of one a1 word.  With
 * WORDS_BIGENDIAN pixel 0 is the most significant bit of the word,
 * otherwise it is the least significant one.
 *
 * The mask is computed on uint32_t rather than int: with a signed 1,
 * n == 31 or any mask that reaches bit 31 (e.g. n = 16, offs = 16) shifts
 * into the sign bit, which is undefined behavior for signed operands.
 *
 * Callers must keep n < 32 and offs + n <= 32; pixman_fill1_line only
 * passes such values.
 */
#ifdef WORDS_BIGENDIAN
#define A1_FILL_MASK(n, offs) \
    ((((uint32_t) 1 << (n)) - 1) << (32 - (offs) - (n)))
#else
#define A1_FILL_MASK(n, offs) \
    ((((uint32_t) 1 << (n)) - 1) << (offs))
#endif
2038
2039 static force_inline void
2040 pixman_fill1_line (uint32_t *dst, int offs, int width, int v)
2041 {
2042     if (offs)
2043     {
2044         int leading_pixels = 32 - offs;
2045         if (leading_pixels >= width)
2046         {
2047             if (v)
2048                 *dst |= A1_FILL_MASK (width, offs);
2049             else
2050                 *dst &= ~A1_FILL_MASK (width, offs);
2051             return;
2052         }
2053         else
2054         {
2055             if (v)
2056                 *dst++ |= A1_FILL_MASK (leading_pixels, offs);
2057             else
2058                 *dst++ &= ~A1_FILL_MASK (leading_pixels, offs);
2059             width -= leading_pixels;
2060         }
2061     }
2062     while (width >= 32)
2063     {
2064         if (v)
2065             *dst++ = 0xFFFFFFFF;
2066         else
2067             *dst++ = 0;
2068         width -= 32;
2069     }
2070     if (width > 0)
2071     {
2072         if (v)
2073             *dst |= A1_FILL_MASK (width, 0);
2074         else
2075             *dst &= ~A1_FILL_MASK (width, 0);
2076     }
2077 }
2078
/*
 * Fill a width x height rectangle of a 1 bpp image whose top-left pixel is
 * (x, y).  `stride` is the distance between scanlines in uint32_t units;
 * only bit 0 of `xor` is used and selects between setting and clearing.
 */
static void
pixman_fill1 (uint32_t *bits,
              int       stride,
              int       x,
              int       y,
              int       width,
              int       height,
              uint32_t  xor)
{
    /* word holding the first pixel, and that pixel's position inside it */
    uint32_t *row = bits + y * stride + (x >> 5);
    const int bit_offset = x & 31;

    /* two loops, so that each call passes a literal 0 or 1 as the value */
    if (xor & 1)
    {
        for (; height != 0; height--, row += stride)
            pixman_fill1_line (row, bit_offset, width, 1);
    }
    else
    {
        for (; height != 0; height--, row += stride)
            pixman_fill1_line (row, bit_offset, width, 0);
    }
}
2108
/*
 * Fill a width x height rectangle of an 8 bpp image whose top-left pixel
 * is (x, y) with the low byte of `xor`.  `stride` is the distance between
 * scanlines in uint32_t units.
 *
 * Each scanline is written with memset () instead of a byte-by-byte loop.
 */
static void
pixman_fill8 (uint32_t *bits,
              int       stride,
              int       x,
              int       y,
              int       width,
              int       height,
              uint32_t xor)
{
    int byte_stride = stride * (int) sizeof (uint32_t);
    uint8_t *dst = (uint8_t *) bits + y * byte_stride + x;
    uint8_t v = xor & 0xff;

    /* memset takes a size_t, so a non-positive width must stay a no-op */
    if (width <= 0)
        return;

    while (height--)
    {
        memset (dst, v, (size_t) width);
        dst += byte_stride;
    }
}
2133
/*
 * Fill a width x height rectangle of a 16 bpp image whose top-left pixel
 * is (x, y) with the low 16 bits of `xor`.  `stride` is the distance
 * between scanlines in uint32_t units.
 */
static void
pixman_fill16 (uint32_t *bits,
               int       stride,
               int       x,
               int       y,
               int       width,
               int       height,
               uint32_t xor)
{
    /* scanline distance converted from uint32_t units to uint16_t units */
    const int row_pitch =
        (stride * (int)sizeof (uint32_t)) / (int)sizeof (uint16_t);
    const uint16_t pixel = xor & 0xffff;
    uint16_t *row = (uint16_t *)bits + y * row_pitch + x;

    for (; height != 0; height--, row += row_pitch)
    {
        uint16_t *p = row;
        int left;

        for (left = width; left > 0; left--)
            *p++ = pixel;
    }
}
2159
/*
 * Fill a width x height rectangle of a 32 bpp image whose top-left pixel
 * is (x, y) with `xor`.  `stride` is the distance between scanlines in
 * uint32_t units.
 */
static void
pixman_fill32 (uint32_t *bits,
               int       stride,
               int       x,
               int       y,
               int       width,
               int       height,
               uint32_t  xor)
{
    uint32_t *row = bits + y * stride + x;

    for (; height != 0; height--, row += stride)
    {
        uint32_t *p = row;
        int left;

        for (left = width; left > 0; left--)
            *p++ = xor;
    }
}
2181
2182 static pixman_bool_t
2183 fast_path_fill (pixman_implementation_t *imp,
2184                 uint32_t *               bits,
2185                 int                      stride,
2186                 int                      bpp,
2187                 int                      x,
2188                 int                      y,
2189                 int                      width,
2190                 int                      height,
2191                 uint32_t                 xor)
2192 {
2193     switch (bpp)
2194     {
2195     case 1:
2196         pixman_fill1 (bits, stride, x, y, width, height, xor);
2197         break;
2198
2199     case 8:
2200         pixman_fill8 (bits, stride, x, y, width, height, xor);
2201         break;
2202
2203     case 16:
2204         pixman_fill16 (bits, stride, x, y, width, height, xor);
2205         break;
2206
2207     case 32:
2208         pixman_fill32 (bits, stride, x, y, width, height, xor);
2209         break;
2210
2211     default:
2212         return _pixman_implementation_fill (
2213             imp->delegate, bits, stride, bpp, x, y, width, height, xor);
2214         break;
2215     }
2216
2217     return TRUE;
2218 }
2219
2220 pixman_implementation_t *
2221 _pixman_implementation_create_fast_path (pixman_implementation_t *fallback)
2222 {
2223     pixman_implementation_t *imp = _pixman_implementation_create (fallback, c_fast_paths);
2224
2225     imp->fill = fast_path_fill;
2226
2227     return imp;
2228 }