mmx: add add_0565_0565
[profile/ivi/pixman.git] / pixman / pixman-fast-path.c
1 /* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
2 /*
3  * Copyright © 2000 SuSE, Inc.
4  * Copyright © 2007 Red Hat, Inc.
5  *
6  * Permission to use, copy, modify, distribute, and sell this software and its
7  * documentation for any purpose is hereby granted without fee, provided that
8  * the above copyright notice appear in all copies and that both that
9  * copyright notice and this permission notice appear in supporting
10  * documentation, and that the name of SuSE not be used in advertising or
11  * publicity pertaining to distribution of the software without specific,
12  * written prior permission.  SuSE makes no representations about the
13  * suitability of this software for any purpose.  It is provided "as is"
14  * without express or implied warranty.
15  *
16  * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
18  * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
20  * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
21  * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
22  *
23  * Author:  Keith Packard, SuSE, Inc.
24  */
25
26 #ifdef HAVE_CONFIG_H
27 #include <config.h>
28 #endif
29 #include <string.h>
30 #include <stdlib.h>
31 #include "pixman-private.h"
32 #include "pixman-combine32.h"
33 #include "pixman-inlines.h"
34
35 static force_inline uint32_t
36 fetch_24 (uint8_t *a)
37 {
38     if (((unsigned long)a) & 1)
39     {
40 #ifdef WORDS_BIGENDIAN
41         return (*a << 16) | (*(uint16_t *)(a + 1));
42 #else
43         return *a | (*(uint16_t *)(a + 1) << 8);
44 #endif
45     }
46     else
47     {
48 #ifdef WORDS_BIGENDIAN
49         return (*(uint16_t *)a << 8) | *(a + 2);
50 #else
51         return *(uint16_t *)a | (*(a + 2) << 16);
52 #endif
53     }
54 }
55
56 static force_inline void
57 store_24 (uint8_t *a,
58           uint32_t v)
59 {
60     if (((unsigned long)a) & 1)
61     {
62 #ifdef WORDS_BIGENDIAN
63         *a = (uint8_t) (v >> 16);
64         *(uint16_t *)(a + 1) = (uint16_t) (v);
65 #else
66         *a = (uint8_t) (v);
67         *(uint16_t *)(a + 1) = (uint16_t) (v >> 8);
68 #endif
69     }
70     else
71     {
72 #ifdef WORDS_BIGENDIAN
73         *(uint16_t *)a = (uint16_t)(v >> 8);
74         *(a + 2) = (uint8_t)v;
75 #else
76         *(uint16_t *)a = (uint16_t)v;
77         *(a + 2) = (uint8_t)(v >> 16);
78 #endif
79     }
80 }
81
82 static force_inline uint32_t
83 over (uint32_t src,
84       uint32_t dest)
85 {
86     uint32_t a = ~src >> 24;
87
88     UN8x4_MUL_UN8_ADD_UN8x4 (dest, a, src);
89
90     return dest;
91 }
92
93 static uint32_t
94 in (uint32_t x,
95     uint8_t  y)
96 {
97     uint16_t a = y;
98
99     UN8x4_MUL_UN8 (x, a);
100
101     return x;
102 }
103
104 /*
105  * Naming convention:
106  *
107  *  op_src_mask_dest
108  */
109 static void
110 fast_composite_over_x888_8_8888 (pixman_implementation_t *imp,
111                                  pixman_composite_info_t *info)
112 {
113     PIXMAN_COMPOSITE_ARGS (info);
114     uint32_t    *src, *src_line;
115     uint32_t    *dst, *dst_line;
116     uint8_t     *mask, *mask_line;
117     int src_stride, mask_stride, dst_stride;
118     uint8_t m;
119     uint32_t s, d;
120     int32_t w;
121
122     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
123     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
124     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
125
126     while (height--)
127     {
128         src = src_line;
129         src_line += src_stride;
130         dst = dst_line;
131         dst_line += dst_stride;
132         mask = mask_line;
133         mask_line += mask_stride;
134
135         w = width;
136         while (w--)
137         {
138             m = *mask++;
139             if (m)
140             {
141                 s = *src | 0xff000000;
142
143                 if (m == 0xff)
144                 {
145                     *dst = s;
146                 }
147                 else
148                 {
149                     d = in (s, m);
150                     *dst = over (d, *dst);
151                 }
152             }
153             src++;
154             dst++;
155         }
156     }
157 }
158
159 static void
160 fast_composite_in_n_8_8 (pixman_implementation_t *imp,
161                          pixman_composite_info_t *info)
162 {
163     PIXMAN_COMPOSITE_ARGS (info);
164     uint32_t src, srca;
165     uint8_t     *dst_line, *dst;
166     uint8_t     *mask_line, *mask, m;
167     int dst_stride, mask_stride;
168     int32_t w;
169     uint16_t t;
170
171     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
172
173     srca = src >> 24;
174
175     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
176     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
177
178     if (srca == 0xff)
179     {
180         while (height--)
181         {
182             dst = dst_line;
183             dst_line += dst_stride;
184             mask = mask_line;
185             mask_line += mask_stride;
186             w = width;
187
188             while (w--)
189             {
190                 m = *mask++;
191
192                 if (m == 0)
193                     *dst = 0;
194                 else if (m != 0xff)
195                     *dst = MUL_UN8 (m, *dst, t);
196
197                 dst++;
198             }
199         }
200     }
201     else
202     {
203         while (height--)
204         {
205             dst = dst_line;
206             dst_line += dst_stride;
207             mask = mask_line;
208             mask_line += mask_stride;
209             w = width;
210
211             while (w--)
212             {
213                 m = *mask++;
214                 m = MUL_UN8 (m, srca, t);
215
216                 if (m == 0)
217                     *dst = 0;
218                 else if (m != 0xff)
219                     *dst = MUL_UN8 (m, *dst, t);
220
221                 dst++;
222             }
223         }
224     }
225 }
226
227 static void
228 fast_composite_in_8_8 (pixman_implementation_t *imp,
229                        pixman_composite_info_t *info)
230 {
231     PIXMAN_COMPOSITE_ARGS (info);
232     uint8_t     *dst_line, *dst;
233     uint8_t     *src_line, *src;
234     int dst_stride, src_stride;
235     int32_t w;
236     uint8_t s;
237     uint16_t t;
238
239     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
240     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
241
242     while (height--)
243     {
244         dst = dst_line;
245         dst_line += dst_stride;
246         src = src_line;
247         src_line += src_stride;
248         w = width;
249
250         while (w--)
251         {
252             s = *src++;
253
254             if (s == 0)
255                 *dst = 0;
256             else if (s != 0xff)
257                 *dst = MUL_UN8 (s, *dst, t);
258
259             dst++;
260         }
261     }
262 }
263
264 static void
265 fast_composite_over_n_8_8888 (pixman_implementation_t *imp,
266                               pixman_composite_info_t *info)
267 {
268     PIXMAN_COMPOSITE_ARGS (info);
269     uint32_t src, srca;
270     uint32_t    *dst_line, *dst, d;
271     uint8_t     *mask_line, *mask, m;
272     int dst_stride, mask_stride;
273     int32_t w;
274
275     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
276
277     srca = src >> 24;
278     if (src == 0)
279         return;
280
281     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
282     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
283
284     while (height--)
285     {
286         dst = dst_line;
287         dst_line += dst_stride;
288         mask = mask_line;
289         mask_line += mask_stride;
290         w = width;
291
292         while (w--)
293         {
294             m = *mask++;
295             if (m == 0xff)
296             {
297                 if (srca == 0xff)
298                     *dst = src;
299                 else
300                     *dst = over (src, *dst);
301             }
302             else if (m)
303             {
304                 d = in (src, m);
305                 *dst = over (d, *dst);
306             }
307             dst++;
308         }
309     }
310 }
311
312 static void
313 fast_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
314                                    pixman_composite_info_t *info)
315 {
316     PIXMAN_COMPOSITE_ARGS (info);
317     uint32_t src, s;
318     uint32_t    *dst_line, *dst, d;
319     uint32_t    *mask_line, *mask, ma;
320     int dst_stride, mask_stride;
321     int32_t w;
322
323     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
324
325     if (src == 0)
326         return;
327
328     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
329     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
330
331     while (height--)
332     {
333         dst = dst_line;
334         dst_line += dst_stride;
335         mask = mask_line;
336         mask_line += mask_stride;
337         w = width;
338
339         while (w--)
340         {
341             ma = *mask++;
342
343             if (ma)
344             {
345                 d = *dst;
346                 s = src;
347
348                 UN8x4_MUL_UN8x4_ADD_UN8x4 (s, ma, d);
349
350                 *dst = s;
351             }
352
353             dst++;
354         }
355     }
356 }
357
358 static void
359 fast_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
360                                     pixman_composite_info_t *info)
361 {
362     PIXMAN_COMPOSITE_ARGS (info);
363     uint32_t src, srca, s;
364     uint32_t    *dst_line, *dst, d;
365     uint32_t    *mask_line, *mask, ma;
366     int dst_stride, mask_stride;
367     int32_t w;
368
369     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
370
371     srca = src >> 24;
372     if (src == 0)
373         return;
374
375     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
376     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
377
378     while (height--)
379     {
380         dst = dst_line;
381         dst_line += dst_stride;
382         mask = mask_line;
383         mask_line += mask_stride;
384         w = width;
385
386         while (w--)
387         {
388             ma = *mask++;
389             if (ma == 0xffffffff)
390             {
391                 if (srca == 0xff)
392                     *dst = src;
393                 else
394                     *dst = over (src, *dst);
395             }
396             else if (ma)
397             {
398                 d = *dst;
399                 s = src;
400
401                 UN8x4_MUL_UN8x4 (s, ma);
402                 UN8x4_MUL_UN8 (ma, srca);
403                 ma = ~ma;
404                 UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
405
406                 *dst = d;
407             }
408
409             dst++;
410         }
411     }
412 }
413
414 static void
415 fast_composite_over_n_8_0888 (pixman_implementation_t *imp,
416                               pixman_composite_info_t *info)
417 {
418     PIXMAN_COMPOSITE_ARGS (info);
419     uint32_t src, srca;
420     uint8_t     *dst_line, *dst;
421     uint32_t d;
422     uint8_t     *mask_line, *mask, m;
423     int dst_stride, mask_stride;
424     int32_t w;
425
426     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
427
428     srca = src >> 24;
429     if (src == 0)
430         return;
431
432     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
433     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
434
435     while (height--)
436     {
437         dst = dst_line;
438         dst_line += dst_stride;
439         mask = mask_line;
440         mask_line += mask_stride;
441         w = width;
442
443         while (w--)
444         {
445             m = *mask++;
446             if (m == 0xff)
447             {
448                 if (srca == 0xff)
449                 {
450                     d = src;
451                 }
452                 else
453                 {
454                     d = fetch_24 (dst);
455                     d = over (src, d);
456                 }
457                 store_24 (dst, d);
458             }
459             else if (m)
460             {
461                 d = over (in (src, m), fetch_24 (dst));
462                 store_24 (dst, d);
463             }
464             dst += 3;
465         }
466     }
467 }
468
469 static void
470 fast_composite_over_n_8_0565 (pixman_implementation_t *imp,
471                               pixman_composite_info_t *info)
472 {
473     PIXMAN_COMPOSITE_ARGS (info);
474     uint32_t src, srca;
475     uint16_t    *dst_line, *dst;
476     uint32_t d;
477     uint8_t     *mask_line, *mask, m;
478     int dst_stride, mask_stride;
479     int32_t w;
480
481     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
482
483     srca = src >> 24;
484     if (src == 0)
485         return;
486
487     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
488     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
489
490     while (height--)
491     {
492         dst = dst_line;
493         dst_line += dst_stride;
494         mask = mask_line;
495         mask_line += mask_stride;
496         w = width;
497
498         while (w--)
499         {
500             m = *mask++;
501             if (m == 0xff)
502             {
503                 if (srca == 0xff)
504                 {
505                     d = src;
506                 }
507                 else
508                 {
509                     d = *dst;
510                     d = over (src, CONVERT_0565_TO_0888 (d));
511                 }
512                 *dst = CONVERT_8888_TO_0565 (d);
513             }
514             else if (m)
515             {
516                 d = *dst;
517                 d = over (in (src, m), CONVERT_0565_TO_0888 (d));
518                 *dst = CONVERT_8888_TO_0565 (d);
519             }
520             dst++;
521         }
522     }
523 }
524
525 static void
526 fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
527                                     pixman_composite_info_t *info)
528 {
529     PIXMAN_COMPOSITE_ARGS (info);
530     uint32_t  src, srca, s;
531     uint16_t  src16;
532     uint16_t *dst_line, *dst;
533     uint32_t  d;
534     uint32_t *mask_line, *mask, ma;
535     int dst_stride, mask_stride;
536     int32_t w;
537
538     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
539
540     srca = src >> 24;
541     if (src == 0)
542         return;
543
544     src16 = CONVERT_8888_TO_0565 (src);
545
546     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
547     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
548
549     while (height--)
550     {
551         dst = dst_line;
552         dst_line += dst_stride;
553         mask = mask_line;
554         mask_line += mask_stride;
555         w = width;
556
557         while (w--)
558         {
559             ma = *mask++;
560             if (ma == 0xffffffff)
561             {
562                 if (srca == 0xff)
563                 {
564                     *dst = src16;
565                 }
566                 else
567                 {
568                     d = *dst;
569                     d = over (src, CONVERT_0565_TO_0888 (d));
570                     *dst = CONVERT_8888_TO_0565 (d);
571                 }
572             }
573             else if (ma)
574             {
575                 d = *dst;
576                 d = CONVERT_0565_TO_0888 (d);
577
578                 s = src;
579
580                 UN8x4_MUL_UN8x4 (s, ma);
581                 UN8x4_MUL_UN8 (ma, srca);
582                 ma = ~ma;
583                 UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
584
585                 *dst = CONVERT_8888_TO_0565 (d);
586             }
587             dst++;
588         }
589     }
590 }
591
592 static void
593 fast_composite_over_8888_8888 (pixman_implementation_t *imp,
594                                pixman_composite_info_t *info)
595 {
596     PIXMAN_COMPOSITE_ARGS (info);
597     uint32_t    *dst_line, *dst;
598     uint32_t    *src_line, *src, s;
599     int dst_stride, src_stride;
600     uint8_t a;
601     int32_t w;
602
603     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
604     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
605
606     while (height--)
607     {
608         dst = dst_line;
609         dst_line += dst_stride;
610         src = src_line;
611         src_line += src_stride;
612         w = width;
613
614         while (w--)
615         {
616             s = *src++;
617             a = s >> 24;
618             if (a == 0xff)
619                 *dst = s;
620             else if (s)
621                 *dst = over (s, *dst);
622             dst++;
623         }
624     }
625 }
626
627 static void
628 fast_composite_src_x888_8888 (pixman_implementation_t *imp,
629                               pixman_composite_info_t *info)
630 {
631     PIXMAN_COMPOSITE_ARGS (info);
632     uint32_t    *dst_line, *dst;
633     uint32_t    *src_line, *src;
634     int dst_stride, src_stride;
635     int32_t w;
636
637     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
638     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
639
640     while (height--)
641     {
642         dst = dst_line;
643         dst_line += dst_stride;
644         src = src_line;
645         src_line += src_stride;
646         w = width;
647
648         while (w--)
649             *dst++ = (*src++) | 0xff000000;
650     }
651 }
652
#if 0
/*
 * Disabled by the surrounding "#if 0"; this routine is not compiled.
 *
 * Composite a 32-bit source onto a packed 24-bit destination.  Pixels whose
 * source alpha byte is 0 are skipped; an alpha byte of 0xff stores the
 * source directly, anything else is combined with the fetched destination
 * using over ().  Destination pixels are accessed with fetch_24 () /
 * store_24 ().
 */
static void
fast_composite_over_8888_0888 (pixman_implementation_t *imp,
                               pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t  *dst_row, *dst_px;
    uint32_t  result;
    uint32_t *src_row, *src_px, src_val;
    uint8_t   src_alpha;
    int       dst_stride, src_stride;
    int32_t   remaining;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_row, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_row, 3);

    for (; height != 0; height--)
    {
        src_px = src_row;
        dst_px = dst_row;

        for (remaining = width; remaining != 0; remaining--, dst_px += 3)
        {
            src_val = *src_px++;
            src_alpha = src_val >> 24;

            if (src_alpha == 0)
                continue;

            if (src_alpha == 0xff)
                result = src_val;
            else
                result = over (src_val, fetch_24 (dst_px));

            store_24 (dst_px, result);
        }

        src_row += src_stride;
        dst_row += dst_stride;
    }
}
#endif
695
696 static void
697 fast_composite_over_8888_0565 (pixman_implementation_t *imp,
698                                pixman_composite_info_t *info)
699 {
700     PIXMAN_COMPOSITE_ARGS (info);
701     uint16_t    *dst_line, *dst;
702     uint32_t d;
703     uint32_t    *src_line, *src, s;
704     uint8_t a;
705     int dst_stride, src_stride;
706     int32_t w;
707
708     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
709     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
710
711     while (height--)
712     {
713         dst = dst_line;
714         dst_line += dst_stride;
715         src = src_line;
716         src_line += src_stride;
717         w = width;
718
719         while (w--)
720         {
721             s = *src++;
722             a = s >> 24;
723             if (s)
724             {
725                 if (a == 0xff)
726                 {
727                     d = s;
728                 }
729                 else
730                 {
731                     d = *dst;
732                     d = over (s, CONVERT_0565_TO_0888 (d));
733                 }
734                 *dst = CONVERT_8888_TO_0565 (d);
735             }
736             dst++;
737         }
738     }
739 }
740
741 static void
742 fast_composite_src_x888_0565 (pixman_implementation_t *imp,
743                               pixman_composite_info_t *info)
744 {
745     PIXMAN_COMPOSITE_ARGS (info);
746     uint16_t    *dst_line, *dst;
747     uint32_t    *src_line, *src, s;
748     int dst_stride, src_stride;
749     int32_t w;
750
751     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
752     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
753
754     while (height--)
755     {
756         dst = dst_line;
757         dst_line += dst_stride;
758         src = src_line;
759         src_line += src_stride;
760         w = width;
761
762         while (w--)
763         {
764             s = *src++;
765             *dst = CONVERT_8888_TO_0565 (s);
766             dst++;
767         }
768     }
769 }
770
771 static void
772 fast_composite_add_8_8 (pixman_implementation_t *imp,
773                         pixman_composite_info_t *info)
774 {
775     PIXMAN_COMPOSITE_ARGS (info);
776     uint8_t     *dst_line, *dst;
777     uint8_t     *src_line, *src;
778     int dst_stride, src_stride;
779     int32_t w;
780     uint8_t s, d;
781     uint16_t t;
782
783     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
784     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
785
786     while (height--)
787     {
788         dst = dst_line;
789         dst_line += dst_stride;
790         src = src_line;
791         src_line += src_stride;
792         w = width;
793
794         while (w--)
795         {
796             s = *src++;
797             if (s)
798             {
799                 if (s != 0xff)
800                 {
801                     d = *dst;
802                     t = d + s;
803                     s = t | (0 - (t >> 8));
804                 }
805                 *dst = s;
806             }
807             dst++;
808         }
809     }
810 }
811
812 static void
813 fast_composite_add_0565_0565 (pixman_implementation_t *imp,
814                               pixman_composite_info_t *info)
815 {
816     PIXMAN_COMPOSITE_ARGS (info);
817     uint16_t    *dst_line, *dst;
818     uint32_t    d;
819     uint16_t    *src_line, *src;
820     uint32_t    s;
821     int dst_stride, src_stride;
822     int32_t w;
823
824     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
825     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
826
827     while (height--)
828     {
829         dst = dst_line;
830         dst_line += dst_stride;
831         src = src_line;
832         src_line += src_stride;
833         w = width;
834
835         while (w--)
836         {
837             s = *src++;
838             if (s)
839             {
840                 d = *dst;
841                 s = CONVERT_0565_TO_8888 (s);
842                 if (d)
843                 {
844                     d = CONVERT_0565_TO_8888 (d);
845                     UN8x4_ADD_UN8x4 (s, d);
846                 }
847                 *dst = CONVERT_8888_TO_0565 (s);
848             }
849             dst++;
850         }
851     }
852 }
853
854 static void
855 fast_composite_add_8888_8888 (pixman_implementation_t *imp,
856                               pixman_composite_info_t *info)
857 {
858     PIXMAN_COMPOSITE_ARGS (info);
859     uint32_t    *dst_line, *dst;
860     uint32_t    *src_line, *src;
861     int dst_stride, src_stride;
862     int32_t w;
863     uint32_t s, d;
864
865     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
866     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
867
868     while (height--)
869     {
870         dst = dst_line;
871         dst_line += dst_stride;
872         src = src_line;
873         src_line += src_stride;
874         w = width;
875
876         while (w--)
877         {
878             s = *src++;
879             if (s)
880             {
881                 if (s != 0xffffffff)
882                 {
883                     d = *dst;
884                     if (d)
885                         UN8x4_ADD_UN8x4 (s, d);
886                 }
887                 *dst = s;
888             }
889             dst++;
890         }
891     }
892 }
893
894 static void
895 fast_composite_add_n_8_8 (pixman_implementation_t *imp,
896                           pixman_composite_info_t *info)
897 {
898     PIXMAN_COMPOSITE_ARGS (info);
899     uint8_t     *dst_line, *dst;
900     uint8_t     *mask_line, *mask;
901     int dst_stride, mask_stride;
902     int32_t w;
903     uint32_t src;
904     uint8_t sa;
905
906     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
907     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
908     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
909     sa = (src >> 24);
910
911     while (height--)
912     {
913         dst = dst_line;
914         dst_line += dst_stride;
915         mask = mask_line;
916         mask_line += mask_stride;
917         w = width;
918
919         while (w--)
920         {
921             uint16_t tmp;
922             uint16_t a;
923             uint32_t m, d;
924             uint32_t r;
925
926             a = *mask++;
927             d = *dst;
928
929             m = MUL_UN8 (sa, a, tmp);
930             r = ADD_UN8 (m, d, tmp);
931
932             *dst++ = r;
933         }
934     }
935 }
936
/*
 * Helpers for addressing single bits inside arrays of 32-bit words.
 *
 * CREATE_BITMASK (n) yields the mask for bit n (0..31) of a word and
 * UPDATE_BITMASK (m) moves a mask on to the next bit; the result becomes 0
 * once the mask is shifted out of the word.  When WORDS_BIGENDIAN is
 * defined bit 0 is the most significant bit, otherwise the least
 * significant one.
 *
 * The little-endian mask is built from an unsigned constant: "1 << 31" on a
 * signed int is undefined behaviour.
 */
#ifdef WORDS_BIGENDIAN
#define CREATE_BITMASK(n) (0x80000000 >> (n))
#define UPDATE_BITMASK(n) ((n) >> 1)
#else
#define CREATE_BITMASK(n) (1U << (n))
#define UPDATE_BITMASK(n) ((n) << 1)
#endif

/*
 * TEST_BIT (p, n) evaluates to non-zero when bit n of the word array p is
 * set; SET_BIT (p, n) sets that bit.  Both index the word with n >> 5 and
 * the bit inside it with n & 31.
 *
 * SET_BIT deliberately has no trailing semicolon after "while (0)": the
 * caller supplies it, which keeps "if (...) SET_BIT (...); else ..." valid.
 */
#define TEST_BIT(p, n)                                  \
    (*((p) + ((n) >> 5)) & CREATE_BITMASK ((n) & 31))
#define SET_BIT(p, n)                                                   \
    do { *((p) + ((n) >> 5)) |= CREATE_BITMASK ((n) & 31); } while (0)
949
950 static void
951 fast_composite_add_1_1 (pixman_implementation_t *imp,
952                         pixman_composite_info_t *info)
953 {
954     PIXMAN_COMPOSITE_ARGS (info);
955     uint32_t     *dst_line, *dst;
956     uint32_t     *src_line, *src;
957     int           dst_stride, src_stride;
958     int32_t       w;
959
960     PIXMAN_IMAGE_GET_LINE (src_image, 0, src_y, uint32_t,
961                            src_stride, src_line, 1);
962     PIXMAN_IMAGE_GET_LINE (dest_image, 0, dest_y, uint32_t,
963                            dst_stride, dst_line, 1);
964
965     while (height--)
966     {
967         dst = dst_line;
968         dst_line += dst_stride;
969         src = src_line;
970         src_line += src_stride;
971         w = width;
972
973         while (w--)
974         {
975             /*
976              * TODO: improve performance by processing uint32_t data instead
977              *       of individual bits
978              */
979             if (TEST_BIT (src, src_x + w))
980                 SET_BIT (dst, dest_x + w);
981         }
982     }
983 }
984
985 static void
986 fast_composite_over_n_1_8888 (pixman_implementation_t *imp,
987                               pixman_composite_info_t *info)
988 {
989     PIXMAN_COMPOSITE_ARGS (info);
990     uint32_t     src, srca;
991     uint32_t    *dst, *dst_line;
992     uint32_t    *mask, *mask_line;
993     int          mask_stride, dst_stride;
994     uint32_t     bitcache, bitmask;
995     int32_t      w;
996
997     if (width <= 0)
998         return;
999
1000     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1001     srca = src >> 24;
1002     if (src == 0)
1003         return;
1004
1005     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t,
1006                            dst_stride, dst_line, 1);
1007     PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
1008                            mask_stride, mask_line, 1);
1009     mask_line += mask_x >> 5;
1010
1011     if (srca == 0xff)
1012     {
1013         while (height--)
1014         {
1015             dst = dst_line;
1016             dst_line += dst_stride;
1017             mask = mask_line;
1018             mask_line += mask_stride;
1019             w = width;
1020
1021             bitcache = *mask++;
1022             bitmask = CREATE_BITMASK (mask_x & 31);
1023
1024             while (w--)
1025             {
1026                 if (bitmask == 0)
1027                 {
1028                     bitcache = *mask++;
1029                     bitmask = CREATE_BITMASK (0);
1030                 }
1031                 if (bitcache & bitmask)
1032                     *dst = src;
1033                 bitmask = UPDATE_BITMASK (bitmask);
1034                 dst++;
1035             }
1036         }
1037     }
1038     else
1039     {
1040         while (height--)
1041         {
1042             dst = dst_line;
1043             dst_line += dst_stride;
1044             mask = mask_line;
1045             mask_line += mask_stride;
1046             w = width;
1047
1048             bitcache = *mask++;
1049             bitmask = CREATE_BITMASK (mask_x & 31);
1050
1051             while (w--)
1052             {
1053                 if (bitmask == 0)
1054                 {
1055                     bitcache = *mask++;
1056                     bitmask = CREATE_BITMASK (0);
1057                 }
1058                 if (bitcache & bitmask)
1059                     *dst = over (src, *dst);
1060                 bitmask = UPDATE_BITMASK (bitmask);
1061                 dst++;
1062             }
1063         }
1064     }
1065 }
1066
1067 static void
1068 fast_composite_over_n_1_0565 (pixman_implementation_t *imp,
1069                               pixman_composite_info_t *info)
1070 {
1071     PIXMAN_COMPOSITE_ARGS (info);
1072     uint32_t     src, srca;
1073     uint16_t    *dst, *dst_line;
1074     uint32_t    *mask, *mask_line;
1075     int          mask_stride, dst_stride;
1076     uint32_t     bitcache, bitmask;
1077     int32_t      w;
1078     uint32_t     d;
1079     uint16_t     src565;
1080
1081     if (width <= 0)
1082         return;
1083
1084     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1085     srca = src >> 24;
1086     if (src == 0)
1087         return;
1088
1089     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t,
1090                            dst_stride, dst_line, 1);
1091     PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
1092                            mask_stride, mask_line, 1);
1093     mask_line += mask_x >> 5;
1094
1095     if (srca == 0xff)
1096     {
1097         src565 = CONVERT_8888_TO_0565 (src);
1098         while (height--)
1099         {
1100             dst = dst_line;
1101             dst_line += dst_stride;
1102             mask = mask_line;
1103             mask_line += mask_stride;
1104             w = width;
1105
1106             bitcache = *mask++;
1107             bitmask = CREATE_BITMASK (mask_x & 31);
1108
1109             while (w--)
1110             {
1111                 if (bitmask == 0)
1112                 {
1113                     bitcache = *mask++;
1114                     bitmask = CREATE_BITMASK (0);
1115                 }
1116                 if (bitcache & bitmask)
1117                     *dst = src565;
1118                 bitmask = UPDATE_BITMASK (bitmask);
1119                 dst++;
1120             }
1121         }
1122     }
1123     else
1124     {
1125         while (height--)
1126         {
1127             dst = dst_line;
1128             dst_line += dst_stride;
1129             mask = mask_line;
1130             mask_line += mask_stride;
1131             w = width;
1132
1133             bitcache = *mask++;
1134             bitmask = CREATE_BITMASK (mask_x & 31);
1135
1136             while (w--)
1137             {
1138                 if (bitmask == 0)
1139                 {
1140                     bitcache = *mask++;
1141                     bitmask = CREATE_BITMASK (0);
1142                 }
1143                 if (bitcache & bitmask)
1144                 {
1145                     d = over (src, CONVERT_0565_TO_0888 (*dst));
1146                     *dst = CONVERT_8888_TO_0565 (d);
1147                 }
1148                 bitmask = UPDATE_BITMASK (bitmask);
1149                 dst++;
1150             }
1151         }
1152     }
1153 }
1154
/*
 * Solid fill and simple bitblt
 */
1158
1159 static void
1160 fast_composite_solid_fill (pixman_implementation_t *imp,
1161                            pixman_composite_info_t *info)
1162 {
1163     PIXMAN_COMPOSITE_ARGS (info);
1164     uint32_t src;
1165
1166     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1167
1168     if (dest_image->bits.format == PIXMAN_a1)
1169     {
1170         src = src >> 31;
1171     }
1172     else if (dest_image->bits.format == PIXMAN_a8)
1173     {
1174         src = src >> 24;
1175     }
1176     else if (dest_image->bits.format == PIXMAN_r5g6b5 ||
1177              dest_image->bits.format == PIXMAN_b5g6r5)
1178     {
1179         src = CONVERT_8888_TO_0565 (src);
1180     }
1181
1182     pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
1183                  PIXMAN_FORMAT_BPP (dest_image->bits.format),
1184                  dest_x, dest_y,
1185                  width, height,
1186                  src);
1187 }
1188
1189 static void
1190 fast_composite_src_memcpy (pixman_implementation_t *imp,
1191                            pixman_composite_info_t *info)
1192 {
1193     PIXMAN_COMPOSITE_ARGS (info);
1194     int bpp = PIXMAN_FORMAT_BPP (dest_image->bits.format) / 8;
1195     uint32_t n_bytes = width * bpp;
1196     int dst_stride, src_stride;
1197     uint8_t    *dst;
1198     uint8_t    *src;
1199
1200     src_stride = src_image->bits.rowstride * 4;
1201     dst_stride = dest_image->bits.rowstride * 4;
1202
1203     src = (uint8_t *)src_image->bits.bits + src_y * src_stride + src_x * bpp;
1204     dst = (uint8_t *)dest_image->bits.bits + dest_y * dst_stride + dest_x * bpp;
1205
1206     while (height--)
1207     {
1208         memcpy (dst, src, n_bytes);
1209
1210         dst += dst_stride;
1211         src += src_stride;
1212     }
1213 }
1214
/*
 * Instantiate the nearest-neighbour scaling fast paths with FAST_NEAREST.
 *
 * NOTE(review): FAST_NEAREST is defined outside this part of the file
 * (presumably in pixman-inlines.h).  Judging by the invocations, the
 * arguments appear to be: name suffix, source format, destination format,
 * source pixel type, destination pixel type, operator, repeat mode --
 * confirm against the macro definition.
 *
 * The 0565 -> 0565 SRC variants for COVER, NONE and PAD are not listed
 * here; they are instantiated further down with FAST_NEAREST_MAINLOOP
 * around a hand-unrolled scanline function.
 */
FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, SRC, COVER)
FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, SRC, NONE)
FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, SRC, PAD)
FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, SRC, NORMAL)
FAST_NEAREST (x888_8888_cover, x888, 8888, uint32_t, uint32_t, SRC, COVER)
FAST_NEAREST (x888_8888_pad, x888, 8888, uint32_t, uint32_t, SRC, PAD)
FAST_NEAREST (x888_8888_normal, x888, 8888, uint32_t, uint32_t, SRC, NORMAL)
FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, OVER, COVER)
FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, OVER, NONE)
FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, OVER, PAD)
FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, OVER, NORMAL)
FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER)
FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, NONE)
FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD)
FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL)
FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL)
FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER)
FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE)
FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD)
FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL)
1235
1236 #define REPEAT_MIN_WIDTH    32
1237
1238 static void
1239 fast_composite_tiled_repeat (pixman_implementation_t *imp,
1240                              pixman_composite_info_t *info)
1241 {
1242     PIXMAN_COMPOSITE_ARGS (info);
1243     pixman_composite_func_t func;
1244     pixman_format_code_t mask_format;
1245     uint32_t src_flags, mask_flags;
1246
1247     src_flags = (info->src_flags & ~FAST_PATH_NORMAL_REPEAT) |
1248                     FAST_PATH_SAMPLES_COVER_CLIP_NEAREST;
1249
1250     if (mask_image)
1251     {
1252         mask_format = mask_image->common.extended_format_code;
1253         mask_flags = info->mask_flags;
1254     }
1255     else
1256     {
1257         mask_format = PIXMAN_null;
1258         mask_flags = FAST_PATH_IS_OPAQUE;
1259     }
1260
1261     if (_pixman_lookup_composite_function (
1262             imp->toplevel, info->op,
1263             src_image->common.extended_format_code, src_flags,
1264             mask_format, mask_flags,
1265             dest_image->common.extended_format_code, info->dest_flags,
1266             &imp, &func))
1267     {
1268         int32_t sx, sy;
1269         int32_t width_remain;
1270         int32_t num_pixels;
1271         int32_t src_width;
1272         int32_t i, j;
1273         pixman_image_t extended_src_image;
1274         uint32_t extended_src[REPEAT_MIN_WIDTH * 2];
1275         pixman_bool_t need_src_extension;
1276         uint32_t *src_line;
1277         int32_t src_stride;
1278         int32_t src_bpp;
1279         pixman_composite_info_t info2 = *info;
1280
1281         src_bpp = PIXMAN_FORMAT_BPP (src_image->bits.format);
1282
1283         if (src_image->bits.width < REPEAT_MIN_WIDTH &&
1284             (src_bpp == 32 || src_bpp == 16 || src_bpp == 8))
1285         {
1286             sx = src_x;
1287             sx = MOD (sx, src_image->bits.width);
1288             sx += width;
1289             src_width = 0;
1290
1291             while (src_width < REPEAT_MIN_WIDTH && src_width <= sx)
1292                 src_width += src_image->bits.width;
1293
1294             src_stride = (src_width * (src_bpp >> 3) + 3) / (int) sizeof (uint32_t);
1295
1296             /* Initialize/validate stack-allocated temporary image */
1297             _pixman_bits_image_init (&extended_src_image, src_image->bits.format,
1298                                      src_width, 1, &extended_src[0], src_stride);
1299             _pixman_image_validate (&extended_src_image);
1300
1301             info2.src_image = &extended_src_image;
1302             need_src_extension = TRUE;
1303         }
1304         else
1305         {
1306             src_width = src_image->bits.width;
1307             need_src_extension = FALSE;
1308         }
1309
1310         sx = src_x;
1311         sy = src_y;
1312
1313         while (--height >= 0)
1314         {
1315             sx = MOD (sx, src_width);
1316             sy = MOD (sy, src_image->bits.height);
1317
1318             if (need_src_extension)
1319             {
1320                 if (src_bpp == 32)
1321                 {
1322                     PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint32_t, src_stride, src_line, 1);
1323
1324                     for (i = 0; i < src_width; )
1325                     {
1326                         for (j = 0; j < src_image->bits.width; j++, i++)
1327                             extended_src[i] = src_line[j];
1328                     }
1329                 }
1330                 else if (src_bpp == 16)
1331                 {
1332                     uint16_t *src_line_16;
1333
1334                     PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint16_t, src_stride,
1335                                            src_line_16, 1);
1336                     src_line = (uint32_t*)src_line_16;
1337
1338                     for (i = 0; i < src_width; )
1339                     {
1340                         for (j = 0; j < src_image->bits.width; j++, i++)
1341                             ((uint16_t*)extended_src)[i] = ((uint16_t*)src_line)[j];
1342                     }
1343                 }
1344                 else if (src_bpp == 8)
1345                 {
1346                     uint8_t *src_line_8;
1347
1348                     PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint8_t, src_stride,
1349                                            src_line_8, 1);
1350                     src_line = (uint32_t*)src_line_8;
1351
1352                     for (i = 0; i < src_width; )
1353                     {
1354                         for (j = 0; j < src_image->bits.width; j++, i++)
1355                             ((uint8_t*)extended_src)[i] = ((uint8_t*)src_line)[j];
1356                     }
1357                 }
1358
1359                 info2.src_y = 0;
1360             }
1361             else
1362             {
1363                 info2.src_y = sy;
1364             }
1365
1366             width_remain = width;
1367
1368             while (width_remain > 0)
1369             {
1370                 num_pixels = src_width - sx;
1371
1372                 if (num_pixels > width_remain)
1373                     num_pixels = width_remain;
1374
1375                 info2.src_x = sx;
1376                 info2.width = num_pixels;
1377                 info2.height = 1;
1378
1379                 func (imp, &info2);
1380
1381                 width_remain -= num_pixels;
1382                 info2.mask_x += num_pixels;
1383                 info2.dest_x += num_pixels;
1384                 sx = 0;
1385             }
1386
1387             sx = src_x;
1388             sy++;
1389             info2.mask_x = info->mask_x;
1390             info2.mask_y++;
1391             info2.dest_x = info->dest_x;
1392             info2.dest_y++;
1393         }
1394
1395         if (need_src_extension)
1396             _pixman_image_fini (&extended_src_image);
1397     }
1398     else
1399     {
1400         _pixman_log_error (FUNC, "Didn't find a suitable function ");
1401     }
1402 }
1403
1404 /* Use more unrolling for src_0565_0565 because it is typically CPU bound */
1405 static force_inline void
1406 scaled_nearest_scanline_565_565_SRC (uint16_t *       dst,
1407                                      const uint16_t * src,
1408                                      int32_t          w,
1409                                      pixman_fixed_t   vx,
1410                                      pixman_fixed_t   unit_x,
1411                                      pixman_fixed_t   max_vx,
1412                                      pixman_bool_t    fully_transparent_src)
1413 {
1414     uint16_t tmp1, tmp2, tmp3, tmp4;
1415     while ((w -= 4) >= 0)
1416     {
1417         tmp1 = src[pixman_fixed_to_int (vx)];
1418         vx += unit_x;
1419         tmp2 = src[pixman_fixed_to_int (vx)];
1420         vx += unit_x;
1421         tmp3 = src[pixman_fixed_to_int (vx)];
1422         vx += unit_x;
1423         tmp4 = src[pixman_fixed_to_int (vx)];
1424         vx += unit_x;
1425         *dst++ = tmp1;
1426         *dst++ = tmp2;
1427         *dst++ = tmp3;
1428         *dst++ = tmp4;
1429     }
1430     if (w & 2)
1431     {
1432         tmp1 = src[pixman_fixed_to_int (vx)];
1433         vx += unit_x;
1434         tmp2 = src[pixman_fixed_to_int (vx)];
1435         vx += unit_x;
1436         *dst++ = tmp1;
1437         *dst++ = tmp2;
1438     }
1439     if (w & 1)
1440         *dst++ = src[pixman_fixed_to_int (vx)];
1441 }
1442
/*
 * Instantiate the 0565 -> 0565 SRC nearest-scaling main loops around
 * scaled_nearest_scanline_565_565_SRC, one per repeat mode named in the
 * last argument (COVER, NONE, PAD).
 *
 * NOTE(review): FAST_NEAREST_MAINLOOP is defined outside this part of the
 * file; the arguments appear to be name suffix, scanline function, source
 * pixel type, destination pixel type and repeat mode -- confirm against
 * the macro definition.
 */
FAST_NEAREST_MAINLOOP (565_565_cover_SRC,
                       scaled_nearest_scanline_565_565_SRC,
                       uint16_t, uint16_t, COVER)
FAST_NEAREST_MAINLOOP (565_565_none_SRC,
                       scaled_nearest_scanline_565_565_SRC,
                       uint16_t, uint16_t, NONE)
FAST_NEAREST_MAINLOOP (565_565_pad_SRC,
                       scaled_nearest_scanline_565_565_SRC,
                       uint16_t, uint16_t, PAD)
1452
1453 static force_inline uint32_t
1454 fetch_nearest (pixman_repeat_t src_repeat,
1455                pixman_format_code_t format,
1456                uint32_t *src, int x, int src_width)
1457 {
1458     if (repeat (src_repeat, &x, src_width))
1459     {
1460         if (format == PIXMAN_x8r8g8b8)
1461             return *(src + x) | 0xff000000;
1462         else
1463             return *(src + x);
1464     }
1465     else
1466     {
1467         return 0;
1468     }
1469 }
1470
1471 static force_inline void
1472 combine_over (uint32_t s, uint32_t *dst)
1473 {
1474     if (s)
1475     {
1476         uint8_t ia = 0xff - (s >> 24);
1477
1478         if (ia)
1479             UN8x4_MUL_UN8_ADD_UN8x4 (*dst, ia, s);
1480         else
1481             *dst = s;
1482     }
1483 }
1484
1485 static force_inline void
1486 combine_src (uint32_t s, uint32_t *dst)
1487 {
1488     *dst = s;
1489 }
1490
1491 static void
1492 fast_composite_scaled_nearest (pixman_implementation_t *imp,
1493                                pixman_composite_info_t *info)
1494 {
1495     PIXMAN_COMPOSITE_ARGS (info);
1496     uint32_t       *dst_line;
1497     uint32_t       *src_line;
1498     int             dst_stride, src_stride;
1499     int             src_width, src_height;
1500     pixman_repeat_t src_repeat;
1501     pixman_fixed_t unit_x, unit_y;
1502     pixman_format_code_t src_format;
1503     pixman_vector_t v;
1504     pixman_fixed_t vy;
1505
1506     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1507     /* pass in 0 instead of src_x and src_y because src_x and src_y need to be
1508      * transformed from destination space to source space
1509      */
1510     PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, uint32_t, src_stride, src_line, 1);
1511
1512     /* reference point is the center of the pixel */
1513     v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;
1514     v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;
1515     v.vector[2] = pixman_fixed_1;
1516
1517     if (!pixman_transform_point_3d (src_image->common.transform, &v))
1518         return;
1519
1520     unit_x = src_image->common.transform->matrix[0][0];
1521     unit_y = src_image->common.transform->matrix[1][1];
1522
1523     /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */
1524     v.vector[0] -= pixman_fixed_e;
1525     v.vector[1] -= pixman_fixed_e;
1526
1527     src_height = src_image->bits.height;
1528     src_width = src_image->bits.width;
1529     src_repeat = src_image->common.repeat;
1530     src_format = src_image->bits.format;
1531
1532     vy = v.vector[1];
1533     while (height--)
1534     {
1535         pixman_fixed_t vx = v.vector[0];
1536         int y = pixman_fixed_to_int (vy);
1537         uint32_t *dst = dst_line;
1538
1539         dst_line += dst_stride;
1540
1541         /* adjust the y location by a unit vector in the y direction
1542          * this is equivalent to transforming y+1 of the destination point to source space */
1543         vy += unit_y;
1544
1545         if (!repeat (src_repeat, &y, src_height))
1546         {
1547             if (op == PIXMAN_OP_SRC)
1548                 memset (dst, 0, sizeof (*dst) * width);
1549         }
1550         else
1551         {
1552             int w = width;
1553
1554             uint32_t *src = src_line + y * src_stride;
1555
1556             while (w >= 2)
1557             {
1558                 uint32_t s1, s2;
1559                 int x1, x2;
1560
1561                 x1 = pixman_fixed_to_int (vx);
1562                 vx += unit_x;
1563
1564                 x2 = pixman_fixed_to_int (vx);
1565                 vx += unit_x;
1566
1567                 w -= 2;
1568
1569                 s1 = fetch_nearest (src_repeat, src_format, src, x1, src_width);
1570                 s2 = fetch_nearest (src_repeat, src_format, src, x2, src_width);
1571
1572                 if (op == PIXMAN_OP_OVER)
1573                 {
1574                     combine_over (s1, dst++);
1575                     combine_over (s2, dst++);
1576                 }
1577                 else
1578                 {
1579                     combine_src (s1, dst++);
1580                     combine_src (s2, dst++);
1581                 }
1582             }
1583
1584             while (w--)
1585             {
1586                 uint32_t s;
1587                 int x;
1588
1589                 x = pixman_fixed_to_int (vx);
1590                 vx += unit_x;
1591
1592                 s = fetch_nearest (src_repeat, src_format, src, x, src_width);
1593
1594                 if (op == PIXMAN_OP_OVER)
1595                     combine_over (s, dst++);
1596                 else
1597                     combine_src (s, dst++);
1598             }
1599         }
1600     }
1601 }
1602
1603 #define CACHE_LINE_SIZE 64
1604
1605 #define FAST_SIMPLE_ROTATE(suffix, pix_type)                                  \
1606                                                                               \
1607 static void                                                                   \
1608 blt_rotated_90_trivial_##suffix (pix_type       *dst,                         \
1609                                  int             dst_stride,                  \
1610                                  const pix_type *src,                         \
1611                                  int             src_stride,                  \
1612                                  int             w,                           \
1613                                  int             h)                           \
1614 {                                                                             \
1615     int x, y;                                                                 \
1616     for (y = 0; y < h; y++)                                                   \
1617     {                                                                         \
1618         const pix_type *s = src + (h - y - 1);                                \
1619         pix_type *d = dst + dst_stride * y;                                   \
1620         for (x = 0; x < w; x++)                                               \
1621         {                                                                     \
1622             *d++ = *s;                                                        \
1623             s += src_stride;                                                  \
1624         }                                                                     \
1625     }                                                                         \
1626 }                                                                             \
1627                                                                               \
1628 static void                                                                   \
1629 blt_rotated_270_trivial_##suffix (pix_type       *dst,                        \
1630                                   int             dst_stride,                 \
1631                                   const pix_type *src,                        \
1632                                   int             src_stride,                 \
1633                                   int             w,                          \
1634                                   int             h)                          \
1635 {                                                                             \
1636     int x, y;                                                                 \
1637     for (y = 0; y < h; y++)                                                   \
1638     {                                                                         \
1639         const pix_type *s = src + src_stride * (w - 1) + y;                   \
1640         pix_type *d = dst + dst_stride * y;                                   \
1641         for (x = 0; x < w; x++)                                               \
1642         {                                                                     \
1643             *d++ = *s;                                                        \
1644             s -= src_stride;                                                  \
1645         }                                                                     \
1646     }                                                                         \
1647 }                                                                             \
1648                                                                               \
1649 static void                                                                   \
1650 blt_rotated_90_##suffix (pix_type       *dst,                                 \
1651                          int             dst_stride,                          \
1652                          const pix_type *src,                                 \
1653                          int             src_stride,                          \
1654                          int             W,                                   \
1655                          int             H)                                   \
1656 {                                                                             \
1657     int x;                                                                    \
1658     int leading_pixels = 0, trailing_pixels = 0;                              \
1659     const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type);                 \
1660                                                                               \
1661     /*                                                                        \
1662      * split processing into handling destination as TILE_SIZExH cache line   \
1663      * aligned vertical stripes (optimistically assuming that destination     \
1664      * stride is a multiple of cache line, if not - it will be just a bit     \
1665      * slower)                                                                \
1666      */                                                                       \
1667                                                                               \
1668     if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1))                               \
1669     {                                                                         \
1670         leading_pixels = TILE_SIZE - (((uintptr_t)dst &                       \
1671                             (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
1672         if (leading_pixels > W)                                               \
1673             leading_pixels = W;                                               \
1674                                                                               \
1675         /* unaligned leading part NxH (where N < TILE_SIZE) */                \
1676         blt_rotated_90_trivial_##suffix (                                     \
1677             dst,                                                              \
1678             dst_stride,                                                       \
1679             src,                                                              \
1680             src_stride,                                                       \
1681             leading_pixels,                                                   \
1682             H);                                                               \
1683                                                                               \
1684         dst += leading_pixels;                                                \
1685         src += leading_pixels * src_stride;                                   \
1686         W -= leading_pixels;                                                  \
1687     }                                                                         \
1688                                                                               \
1689     if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1))                         \
1690     {                                                                         \
1691         trailing_pixels = (((uintptr_t)(dst + W) &                            \
1692                             (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
1693         if (trailing_pixels > W)                                              \
1694             trailing_pixels = W;                                              \
1695         W -= trailing_pixels;                                                 \
1696     }                                                                         \
1697                                                                               \
1698     for (x = 0; x < W; x += TILE_SIZE)                                        \
1699     {                                                                         \
1700         /* aligned middle part TILE_SIZExH */                                 \
1701         blt_rotated_90_trivial_##suffix (                                     \
1702             dst + x,                                                          \
1703             dst_stride,                                                       \
1704             src + src_stride * x,                                             \
1705             src_stride,                                                       \
1706             TILE_SIZE,                                                        \
1707             H);                                                               \
1708     }                                                                         \
1709                                                                               \
1710     if (trailing_pixels)                                                      \
1711     {                                                                         \
1712         /* unaligned trailing part NxH (where N < TILE_SIZE) */               \
1713         blt_rotated_90_trivial_##suffix (                                     \
1714             dst + W,                                                          \
1715             dst_stride,                                                       \
1716             src + W * src_stride,                                             \
1717             src_stride,                                                       \
1718             trailing_pixels,                                                  \
1719             H);                                                               \
1720     }                                                                         \
1721 }                                                                             \
1722                                                                               \
1723 static void                                                                   \
1724 blt_rotated_270_##suffix (pix_type       *dst,                                \
1725                           int             dst_stride,                         \
1726                           const pix_type *src,                                \
1727                           int             src_stride,                         \
1728                           int             W,                                  \
1729                           int             H)                                  \
1730 {                                                                             \
1731     int x;                                                                    \
1732     int leading_pixels = 0, trailing_pixels = 0;                              \
1733     const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type);                 \
1734                                                                               \
1735     /*                                                                        \
1736      * split processing into handling destination as TILE_SIZExH cache line   \
1737      * aligned vertical stripes (optimistically assuming that destination     \
1738      * stride is a multiple of cache line, if not - it will be just a bit     \
1739      * slower)                                                                \
1740      */                                                                       \
1741                                                                               \
1742     if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1))                               \
1743     {                                                                         \
1744         leading_pixels = TILE_SIZE - (((uintptr_t)dst &                       \
1745                             (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
1746         if (leading_pixels > W)                                               \
1747             leading_pixels = W;                                               \
1748                                                                               \
1749         /* unaligned leading part NxH (where N < TILE_SIZE) */                \
1750         blt_rotated_270_trivial_##suffix (                                    \
1751             dst,                                                              \
1752             dst_stride,                                                       \
1753             src + src_stride * (W - leading_pixels),                          \
1754             src_stride,                                                       \
1755             leading_pixels,                                                   \
1756             H);                                                               \
1757                                                                               \
1758         dst += leading_pixels;                                                \
1759         W -= leading_pixels;                                                  \
1760     }                                                                         \
1761                                                                               \
1762     if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1))                         \
1763     {                                                                         \
1764         trailing_pixels = (((uintptr_t)(dst + W) &                            \
1765                             (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
1766         if (trailing_pixels > W)                                              \
1767             trailing_pixels = W;                                              \
1768         W -= trailing_pixels;                                                 \
1769         src += trailing_pixels * src_stride;                                  \
1770     }                                                                         \
1771                                                                               \
1772     for (x = 0; x < W; x += TILE_SIZE)                                        \
1773     {                                                                         \
1774         /* aligned middle part TILE_SIZExH */                                 \
1775         blt_rotated_270_trivial_##suffix (                                    \
1776             dst + x,                                                          \
1777             dst_stride,                                                       \
1778             src + src_stride * (W - x - TILE_SIZE),                           \
1779             src_stride,                                                       \
1780             TILE_SIZE,                                                        \
1781             H);                                                               \
1782     }                                                                         \
1783                                                                               \
1784     if (trailing_pixels)                                                      \
1785     {                                                                         \
1786         /* unaligned trailing part NxH (where N < TILE_SIZE) */               \
1787         blt_rotated_270_trivial_##suffix (                                    \
1788             dst + W,                                                          \
1789             dst_stride,                                                       \
1790             src - trailing_pixels * src_stride,                               \
1791             src_stride,                                                       \
1792             trailing_pixels,                                                  \
1793             H);                                                               \
1794     }                                                                         \
1795 }                                                                             \
1796                                                                               \
1797 static void                                                                   \
1798 fast_composite_rotate_90_##suffix (pixman_implementation_t *imp,              \
1799                                    pixman_composite_info_t *info)             \
1800 {                                                                             \
1801     PIXMAN_COMPOSITE_ARGS (info);                                             \
1802     pix_type       *dst_line;                                                 \
1803     pix_type       *src_line;                                                 \
1804     int             dst_stride, src_stride;                                   \
1805     int             src_x_t, src_y_t;                                         \
1806                                                                               \
1807     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, pix_type,              \
1808                            dst_stride, dst_line, 1);                          \
1809     src_x_t = -src_y + pixman_fixed_to_int (                                  \
1810                                 src_image->common.transform->matrix[0][2] +   \
1811                                 pixman_fixed_1 / 2 - pixman_fixed_e) - height;\
1812     src_y_t = src_x + pixman_fixed_to_int (                                   \
1813                                 src_image->common.transform->matrix[1][2] +   \
1814                                 pixman_fixed_1 / 2 - pixman_fixed_e);         \
1815     PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type,             \
1816                            src_stride, src_line, 1);                          \
1817     blt_rotated_90_##suffix (dst_line, dst_stride, src_line, src_stride,      \
1818                              width, height);                                  \
1819 }                                                                             \
1820                                                                               \
1821 static void                                                                   \
1822 fast_composite_rotate_270_##suffix (pixman_implementation_t *imp,             \
1823                                     pixman_composite_info_t *info)            \
1824 {                                                                             \
1825     PIXMAN_COMPOSITE_ARGS (info);                                             \
1826     pix_type       *dst_line;                                                 \
1827     pix_type       *src_line;                                                 \
1828     int             dst_stride, src_stride;                                   \
1829     int             src_x_t, src_y_t;                                         \
1830                                                                               \
1831     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, pix_type,              \
1832                            dst_stride, dst_line, 1);                          \
1833     src_x_t = src_y + pixman_fixed_to_int (                                   \
1834                                 src_image->common.transform->matrix[0][2] +   \
1835                                 pixman_fixed_1 / 2 - pixman_fixed_e);         \
1836     src_y_t = -src_x + pixman_fixed_to_int (                                  \
1837                                 src_image->common.transform->matrix[1][2] +   \
1838                                 pixman_fixed_1 / 2 - pixman_fixed_e) - width; \
1839     PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type,             \
1840                            src_stride, src_line, 1);                          \
1841     blt_rotated_270_##suffix (dst_line, dst_stride, src_line, src_stride,     \
1842                               width, height);                                 \
1843 }
1844
1845 FAST_SIMPLE_ROTATE (8, uint8_t)
1846 FAST_SIMPLE_ROTATE (565, uint16_t)
1847 FAST_SIMPLE_ROTATE (8888, uint32_t)
1848
1849 static const pixman_fast_path_t c_fast_paths[] =
1850 {
1851     PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, fast_composite_over_n_8_0565),
1852     PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, fast_composite_over_n_8_0565),
1853     PIXMAN_STD_FAST_PATH (OVER, solid, a8, r8g8b8, fast_composite_over_n_8_0888),
1854     PIXMAN_STD_FAST_PATH (OVER, solid, a8, b8g8r8, fast_composite_over_n_8_0888),
1855     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, fast_composite_over_n_8_8888),
1856     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, fast_composite_over_n_8_8888),
1857     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, fast_composite_over_n_8_8888),
1858     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, fast_composite_over_n_8_8888),
1859     PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8r8g8b8, fast_composite_over_n_1_8888),
1860     PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8r8g8b8, fast_composite_over_n_1_8888),
1861     PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8b8g8r8, fast_composite_over_n_1_8888),
1862     PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8b8g8r8, fast_composite_over_n_1_8888),
1863     PIXMAN_STD_FAST_PATH (OVER, solid, a1, r5g6b5,   fast_composite_over_n_1_0565),
1864     PIXMAN_STD_FAST_PATH (OVER, solid, a1, b5g6r5,   fast_composite_over_n_1_0565),
1865     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, fast_composite_over_n_8888_8888_ca),
1866     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, fast_composite_over_n_8888_8888_ca),
1867     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, fast_composite_over_n_8888_0565_ca),
1868     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, fast_composite_over_n_8888_8888_ca),
1869     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, fast_composite_over_n_8888_8888_ca),
1870     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, fast_composite_over_n_8888_0565_ca),
1871     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, fast_composite_over_x888_8_8888),
1872     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, fast_composite_over_x888_8_8888),
1873     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, fast_composite_over_x888_8_8888),
1874     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, fast_composite_over_x888_8_8888),
1875     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, fast_composite_over_8888_8888),
1876     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, fast_composite_over_8888_8888),
1877     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, fast_composite_over_8888_0565),
1878     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, fast_composite_over_8888_8888),
1879     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, fast_composite_over_8888_8888),
1880     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, fast_composite_over_8888_0565),
1881     PIXMAN_STD_FAST_PATH (ADD, r5g6b5, null, r5g6b5, fast_composite_add_0565_0565),
1882     PIXMAN_STD_FAST_PATH (ADD, b5g6r5, null, b5g6r5, fast_composite_add_0565_0565),
1883     PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, fast_composite_add_8888_8888),
1884     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, fast_composite_add_8888_8888),
1885     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8_8),
1886     PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1_1),
1887     PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, fast_composite_add_n_8888_8888_ca),
1888     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, fast_composite_add_n_8_8),
1889     PIXMAN_STD_FAST_PATH (SRC, solid, null, a8r8g8b8, fast_composite_solid_fill),
1890     PIXMAN_STD_FAST_PATH (SRC, solid, null, x8r8g8b8, fast_composite_solid_fill),
1891     PIXMAN_STD_FAST_PATH (SRC, solid, null, a8b8g8r8, fast_composite_solid_fill),
1892     PIXMAN_STD_FAST_PATH (SRC, solid, null, x8b8g8r8, fast_composite_solid_fill),
1893     PIXMAN_STD_FAST_PATH (SRC, solid, null, a1, fast_composite_solid_fill),
1894     PIXMAN_STD_FAST_PATH (SRC, solid, null, a8, fast_composite_solid_fill),
1895     PIXMAN_STD_FAST_PATH (SRC, solid, null, r5g6b5, fast_composite_solid_fill),
1896     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, fast_composite_src_x888_8888),
1897     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, fast_composite_src_x888_8888),
1898     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
1899     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, fast_composite_src_memcpy),
1900     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
1901     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
1902     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, fast_composite_src_memcpy),
1903     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
1904     PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8x8, fast_composite_src_memcpy),
1905     PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8a8, fast_composite_src_memcpy),
1906     PIXMAN_STD_FAST_PATH (SRC, b8g8r8x8, null, b8g8r8x8, fast_composite_src_memcpy),
1907     PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, fast_composite_src_memcpy),
1908     PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, fast_composite_src_memcpy),
1909     PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, fast_composite_src_memcpy),
1910     PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, b8g8r8, fast_composite_src_memcpy),
1911     PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
1912     PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
1913     PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, fast_composite_src_memcpy),
1914     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
1915     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
1916     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
1917     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
1918     PIXMAN_STD_FAST_PATH (IN, a8, null, a8, fast_composite_in_8_8),
1919     PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, fast_composite_in_n_8_8),
1920
1921     SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888_8888),
1922     SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888_8888),
1923     SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, 8888_8888),
1924     SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, 8888_8888),
1925
1926     SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888_8888),
1927     SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, 8888_8888),
1928
1929     SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, 8888_565),
1930     SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, 8888_565),
1931
1932     SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, 565_565),
1933
1934     SIMPLE_NEAREST_FAST_PATH_COVER (SRC, x8r8g8b8, a8r8g8b8, x888_8888),
1935     SIMPLE_NEAREST_FAST_PATH_COVER (SRC, x8b8g8r8, a8b8g8r8, x888_8888),
1936     SIMPLE_NEAREST_FAST_PATH_PAD (SRC, x8r8g8b8, a8r8g8b8, x888_8888),
1937     SIMPLE_NEAREST_FAST_PATH_PAD (SRC, x8b8g8r8, a8b8g8r8, x888_8888),
1938     SIMPLE_NEAREST_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, x888_8888),
1939     SIMPLE_NEAREST_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, x888_8888),
1940
1941     SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, 8888_8888),
1942     SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, 8888_8888),
1943     SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, 8888_8888),
1944     SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, 8888_8888),
1945
1946     SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, 8888_565),
1947
1948 #define NEAREST_FAST_PATH(op,s,d)               \
1949     {   PIXMAN_OP_ ## op,                       \
1950         PIXMAN_ ## s, SCALED_NEAREST_FLAGS,     \
1951         PIXMAN_null, 0,                         \
1952         PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
1953         fast_composite_scaled_nearest,          \
1954     }
1955
1956     NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8),
1957     NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8),
1958     NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8),
1959     NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8),
1960
1961     NEAREST_FAST_PATH (SRC, x8r8g8b8, a8r8g8b8),
1962     NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8),
1963     NEAREST_FAST_PATH (SRC, x8b8g8r8, a8b8g8r8),
1964     NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8),
1965
1966     NEAREST_FAST_PATH (OVER, x8r8g8b8, x8r8g8b8),
1967     NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8),
1968     NEAREST_FAST_PATH (OVER, x8b8g8r8, x8b8g8r8),
1969     NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8),
1970
1971     NEAREST_FAST_PATH (OVER, x8r8g8b8, a8r8g8b8),
1972     NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8),
1973     NEAREST_FAST_PATH (OVER, x8b8g8r8, a8b8g8r8),
1974     NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8),
1975
1976 #define SIMPLE_ROTATE_FLAGS(angle)                                        \
1977     (FAST_PATH_ROTATE_ ## angle ## _TRANSFORM   |                         \
1978      FAST_PATH_NEAREST_FILTER                   |                         \
1979      FAST_PATH_SAMPLES_COVER_CLIP_NEAREST       |                         \
1980      FAST_PATH_STANDARD_FLAGS)
1981
1982 #define SIMPLE_ROTATE_FAST_PATH(op,s,d,suffix)                            \
1983     {   PIXMAN_OP_ ## op,                                                 \
1984         PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (90),                           \
1985         PIXMAN_null, 0,                                                   \
1986         PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                           \
1987         fast_composite_rotate_90_##suffix,                                \
1988     },                                                                    \
1989     {   PIXMAN_OP_ ## op,                                                 \
1990         PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (270),                          \
1991         PIXMAN_null, 0,                                                   \
1992         PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                           \
1993         fast_composite_rotate_270_##suffix,                               \
1994     }
1995
1996     SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888),
1997     SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888),
1998     SIMPLE_ROTATE_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888),
1999     SIMPLE_ROTATE_FAST_PATH (SRC, r5g6b5, r5g6b5, 565),
2000     SIMPLE_ROTATE_FAST_PATH (SRC, a8, a8, 8),
2001
2002     /* Simple repeat fast path entry. */
2003     {   PIXMAN_OP_any,
2004         PIXMAN_any,
2005         (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | FAST_PATH_BITS_IMAGE |
2006          FAST_PATH_NORMAL_REPEAT),
2007         PIXMAN_any, 0,
2008         PIXMAN_any, FAST_PATH_STD_DEST_FLAGS,
2009         fast_composite_tiled_repeat
2010     },
2011
2012     {   PIXMAN_OP_NONE  },
2013 };
2014
/*
 * A1_FILL_MASK (n, offs): a word with 'n' consecutive bits set, covering
 * the n pixels that start at pixel position 'offs' of a 32-pixel word.
 *
 * The run of n one-bits is shifted left by 'offs' normally, and by
 * 32 - offs - n when WORDS_BIGENDIAN is defined.  The expression shifts
 * 1U by n, so n must stay below 32; the arguments may be evaluated more
 * than once.
 */
#ifdef WORDS_BIGENDIAN
#define A1_FILL_SHIFT(n, offs) (32 - (offs) - (n))
#else
#define A1_FILL_SHIFT(n, offs) (offs)
#endif
#define A1_FILL_MASK(n, offs) (((1U << (n)) - 1) << A1_FILL_SHIFT (n, offs))
2020
2021 static force_inline void
2022 pixman_fill1_line (uint32_t *dst, int offs, int width, int v)
2023 {
2024     if (offs)
2025     {
2026         int leading_pixels = 32 - offs;
2027         if (leading_pixels >= width)
2028         {
2029             if (v)
2030                 *dst |= A1_FILL_MASK (width, offs);
2031             else
2032                 *dst &= ~A1_FILL_MASK (width, offs);
2033             return;
2034         }
2035         else
2036         {
2037             if (v)
2038                 *dst++ |= A1_FILL_MASK (leading_pixels, offs);
2039             else
2040                 *dst++ &= ~A1_FILL_MASK (leading_pixels, offs);
2041             width -= leading_pixels;
2042         }
2043     }
2044     while (width >= 32)
2045     {
2046         if (v)
2047             *dst++ = 0xFFFFFFFF;
2048         else
2049             *dst++ = 0;
2050         width -= 32;
2051     }
2052     if (width > 0)
2053     {
2054         if (v)
2055             *dst |= A1_FILL_MASK (width, 0);
2056         else
2057             *dst &= ~A1_FILL_MASK (width, 0);
2058     }
2059 }
2060
/*
 * Fill a width x height rectangle of a 1 bpp buffer.
 *
 * bits    start of the pixel buffer
 * stride  distance between rows, in uint32_t units
 * x, y    top-left corner of the rectangle, in pixels
 * width   rectangle width in pixels
 * height  rectangle height in rows
 * xor     fill value; only bit 0 is used (1 sets the pixels, 0 clears
 *         them).  Despite the name, the value is stored, not XOR-ed.
 *
 * Nothing is written unless both width and height are positive: a
 * negative height would otherwise keep the row loop running, and a
 * negative width would produce a negative shift count in the line helper.
 */
static void
pixman_fill1 (uint32_t *bits,
              int       stride,
              int       x,
              int       y,
              int       width,
              int       height,
              uint32_t  xor)
{
    uint32_t *dst;
    int offs = x & 31;

    if (width <= 0 || height <= 0)
        return;

    /* Widen before multiplying so y * stride cannot overflow int. */
    dst = bits + ((intptr_t) y * stride + (x >> 5));

    /*
     * The two loops differ only in the constant passed as the last
     * argument of the force_inline helper.
     */
    if (xor & 1)
    {
        while (height--)
        {
            pixman_fill1_line (dst, offs, width, 1);
            dst += stride;
        }
    }
    else
    {
        while (height--)
        {
            pixman_fill1_line (dst, offs, width, 0);
            dst += stride;
        }
    }
}
2090
/*
 * Fill a width x height rectangle of an 8 bpp buffer.
 *
 * bits    start of the pixel buffer
 * stride  distance between rows, in uint32_t units
 * x, y    top-left corner of the rectangle, in pixels
 * width   rectangle width in pixels
 * height  rectangle height in rows
 * xor     fill value; only the low 8 bits are used.  Despite the name,
 *         the value is stored, not XOR-ed.
 *
 * Nothing is written unless both width and height are positive; a
 * negative height would otherwise keep the row loop running.
 */
static void
pixman_fill8 (uint32_t *bits,
              int       stride,
              int       x,
              int       y,
              int       width,
              int       height,
              uint32_t xor)
{
    int byte_stride = stride * (int) sizeof (uint32_t);
    uint8_t *dst = (uint8_t *) bits;
    uint8_t v = xor & 0xff;

    if (width <= 0 || height <= 0)
        return;

    /* Widen before multiplying so y * byte_stride cannot overflow int. */
    dst += (intptr_t) y * byte_stride + x;

    while (height--)
    {
        memset (dst, v, (size_t) width);
        dst += byte_stride;
    }
}
2115
/*
 * Fill a width x height rectangle of a 16 bpp buffer.
 *
 * bits    start of the pixel buffer
 * stride  distance between rows, in uint32_t units
 * x, y    top-left corner of the rectangle, in pixels
 * width   rectangle width in pixels
 * height  rectangle height in rows
 * xor     fill value; only the low 16 bits are used.  Despite the name,
 *         the value is stored, not XOR-ed.
 *
 * Nothing is written unless both width and height are positive; a
 * negative height would otherwise keep the row loop running.
 */
static void
pixman_fill16 (uint32_t *bits,
               int       stride,
               int       x,
               int       y,
               int       width,
               int       height,
               uint32_t xor)
{
    /* Row distance converted from uint32_t units to uint16_t units. */
    int short_stride =
        (stride * (int)sizeof (uint32_t)) / (int)sizeof (uint16_t);
    uint16_t *dst = (uint16_t *)bits;
    uint16_t v = xor & 0xffff;
    int i;

    if (width <= 0 || height <= 0)
        return;

    /* Widen before multiplying so y * short_stride cannot overflow int. */
    dst += (intptr_t) y * short_stride + x;

    while (height--)
    {
        for (i = 0; i < width; ++i)
            dst[i] = v;

        dst += short_stride;
    }
}
2141
/*
 * Fill a width x height rectangle of a 32 bpp buffer.
 *
 * bits    start of the pixel buffer
 * stride  distance between rows, in uint32_t units (i.e. in pixels)
 * x, y    top-left corner of the rectangle, in pixels
 * width   rectangle width in pixels
 * height  rectangle height in rows
 * xor     value written to every pixel.  Despite the name, the value is
 *         stored, not XOR-ed.
 *
 * Nothing is written unless both width and height are positive; a
 * negative height would otherwise keep the row loop running.
 */
static void
pixman_fill32 (uint32_t *bits,
               int       stride,
               int       x,
               int       y,
               int       width,
               int       height,
               uint32_t  xor)
{
    int i;

    if (width <= 0 || height <= 0)
        return;

    /* Widen before multiplying so y * stride cannot overflow int. */
    bits += (intptr_t) y * stride + x;

    while (height--)
    {
        for (i = 0; i < width; ++i)
            bits[i] = xor;

        bits += stride;
    }
}
2163
2164 static pixman_bool_t
2165 fast_path_fill (pixman_implementation_t *imp,
2166                 uint32_t *               bits,
2167                 int                      stride,
2168                 int                      bpp,
2169                 int                      x,
2170                 int                      y,
2171                 int                      width,
2172                 int                      height,
2173                 uint32_t                 xor)
2174 {
2175     switch (bpp)
2176     {
2177     case 1:
2178         pixman_fill1 (bits, stride, x, y, width, height, xor);
2179         break;
2180
2181     case 8:
2182         pixman_fill8 (bits, stride, x, y, width, height, xor);
2183         break;
2184
2185     case 16:
2186         pixman_fill16 (bits, stride, x, y, width, height, xor);
2187         break;
2188
2189     case 32:
2190         pixman_fill32 (bits, stride, x, y, width, height, xor);
2191         break;
2192
2193     default:
2194         return _pixman_implementation_fill (
2195             imp->delegate, bits, stride, bpp, x, y, width, height, xor);
2196         break;
2197     }
2198
2199     return TRUE;
2200 }
2201
2202 pixman_implementation_t *
2203 _pixman_implementation_create_fast_path (pixman_implementation_t *fallback)
2204 {
2205     pixman_implementation_t *imp = _pixman_implementation_create (fallback, c_fast_paths);
2206
2207     imp->fill = fast_path_fill;
2208
2209     return imp;
2210 }