Initialize the ARM SIMD fast path array.
/*
 * Copyright © 2008 Mozilla Corporation
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Mozilla Corporation not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  Mozilla Corporation makes no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Jeff Muizelaar (jeff@infidigm.net)
 *
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include "pixman-arm-simd.h"

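/* These fast paths rely on the ARMv6 SIMD media instructions (uqadd8,
 * uxtb16, uxtab16), so they only run on ARMv6-or-later cores; the caller
 * is expected to install this table only when run-time CPU detection
 * permits it. */
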
void
fbCompositeSrcAdd_8000x8000arm (pixman_op_t op,
                                pixman_image_t * pSrc,
                                pixman_image_t * pMask,
                                pixman_image_t * pDst,
                                int16_t      xSrc,
                                int16_t      ySrc,
                                int16_t      xMask,
                                int16_t      yMask,
                                int16_t      xDst,
                                int16_t      yDst,
                                uint16_t     width,
                                uint16_t     height)
{
    uint8_t     *dstLine, *dst;
    uint8_t     *srcLine, *src;
    int         dstStride, srcStride;
    uint16_t    w;
    uint8_t     s, d;

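    /* fbComposeGetStart yields a pointer to the pixel at the requested
     * (x, y) origin and the per-row stride, both in units of the named
     * type (uint8_t here, since a8 pixels are one byte each). */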
    fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);
    fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;

        /* Ensure both src and dst are word-aligned before doing 32-bit reads;
         * if they have differing alignments we never satisfy this test and
         * stay in this byte-at-a-time loop for the whole row. */
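        /* uqadd8 adds the four byte lanes of its operands in parallel,
         * saturating each lane at 0xff, which is exactly the ADD operator
         * on a8 data.  The "0" matching constraint ties the d input to
         * output operand 0, so %2 names the same register as %0. */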
        while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
        {
            s = *src;
            d = *dst;
            asm ("uqadd8 %0, %1, %2" : "=r" (d) : "r" (s), "0" (d));
            *dst = d;

            dst++;
            src++;
            w--;
        }

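        /* both pointers are now word-aligned: handle four a8 pixels per
         * iteration with a single 32-bit saturating add */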
        while (w >= 4)
        {
            asm ("uqadd8 %0, %1, %2"
                 : "=r" (*(uint32_t*)dst)
                 : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst));
            dst += 4;
            src += 4;
            w -= 4;
        }

        while (w)
        {
            s = *src;
            d = *dst;
            asm ("uqadd8 %0, %1, %2" : "=r" (d) : "r" (s), "0" (d));
            *dst = d;

            dst++;
            src++;
            w--;
        }
    }
}

void
fbCompositeSrc_8888x8888arm (pixman_op_t op,
                             pixman_image_t * pSrc,
                             pixman_image_t * pMask,
                             pixman_image_t * pDst,
                             int16_t      xSrc,
                             int16_t      ySrc,
                             int16_t      xMask,
                             int16_t      yMask,
                             int16_t      xDst,
                             int16_t      yDst,
                             uint16_t     width,
                             uint16_t     height)
{
    uint32_t    *dstLine, *dst;
    uint32_t    *srcLine, *src;
    int         dstStride, srcStride;
    uint16_t    w;
    uint32_t    component_half = 0x800080;
    uint32_t    upper_component_mask = 0xff00ff00;
    uint32_t    alpha_mask = 0xff;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;

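        /* per pixel this is the OVER operator on premultiplied data:
         *     dst = src + dst * (255 - src.alpha) / 255
         * applied to all four components, eight bits at a time */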
//#define inner_branch
        asm volatile (
                        "cmp %[w], #0\n\t"
                        "beq 2f\n\t"
                        "1:\n\t"
                        /* load src */
                        "ldr r5, [%[src]], #4\n\t"
#ifdef inner_branch
                        /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
                         * The 0x0 case also allows us to avoid doing an unnecessary data
                         * write, which is more valuable, so we only check for that. */
                        "cmp r5, #0\n\t"
                        "beq 3f\n\t"

                        /* = 255 - alpha */
                        "sub r8, %[alpha_mask], r5, lsr #24\n\t"

                        "ldr r4, [%[dest]]\n\t"

#else
                        "ldr r4, [%[dest]]\n\t"

                        /* = 255 - alpha */
                        "sub r8, %[alpha_mask], r5, lsr #24\n\t"
#endif
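                        /* on a little-endian core uxtb16 splits dst into
                         * 16-bit lanes: r6 takes the even bytes (blue and
                         * red), r7 the odd bytes (green and alpha), giving
                         * each component headroom for the multiply below */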
                        "uxtb16 r6, r4\n\t"
                        "uxtb16 r7, r4, ror #8\n\t"

                        /* multiply each component by (255 - alpha) and add
                         * the 0x80 rounding term; the uxtab16s below fold in
                         * the high byte of each lane, dividing by 255 via the
                         * x/255 ~= (x*257) >> 16 identity */
                        "mla r6, r6, r8, %[component_half]\n\t"
                        "mla r7, r7, r8, %[component_half]\n\t"

                        "uxtab16 r6, r6, r6, ror #8\n\t"
                        "uxtab16 r7, r7, r7, ror #8\n\t"
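                        /* worked example for one 16-bit lane: v = 0xff and
                         * 255 - alpha = 0xff give t = 0xff*0xff + 0x80 =
                         * 0xfe81 from the mla, then t + (t >> 8) = 0xff7f,
                         * whose high byte 0xff is v * (255 - alpha) / 255 */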

                        /* recombine the 0xff00ff00 bytes of r6 and r7 */
                        "and r7, r7, %[upper_component_mask]\n\t"
                        "uxtab16 r6, r7, r6, ror #8\n\t"

                        "uqadd8 r5, r6, r5\n\t"

#ifdef inner_branch
                        "3:\n\t"

#endif
                        "str r5, [%[dest]], #4\n\t"
                        /* decrement the counter and branch back to the top */
                        "subs   %[w], %[w], #1\n\t"
                        "bne    1b\n\t"
                        "2:\n\t"
                        : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
                        : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
                          [alpha_mask] "r" (alpha_mask)
                        : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
                        );
    }
}

void
fbCompositeSrc_8888x8x8888arm (pixman_op_t op,
                               pixman_image_t * pSrc,
                               pixman_image_t * pMask,
                               pixman_image_t * pDst,
                               int16_t      xSrc,
                               int16_t      ySrc,
                               int16_t      xMask,
                               int16_t      yMask,
                               int16_t      xDst,
                               int16_t      yDst,
                               uint16_t     width,
                               uint16_t     height)
{
    uint32_t    *dstLine, *dst;
    uint32_t    *srcLine, *src;
    uint32_t    mask;
    int         dstStride, srcStride;
    uint16_t    w;
    uint32_t    component_half = 0x800080;
    uint32_t    alpha_mask = 0xff;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

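    /* the mask is known to be a solid a8 image here (the fast path entry
     * is flagged NEED_SOLID_MASK), so reduce it to its 8-bit alpha once,
     * outside the loop */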
    fbComposeGetSolid (pMask, mask, pDst->bits.format);
    mask >>= 24;

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;

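        /* per pixel: scale src by the constant mask alpha m, then apply
         * OVER:  dst = src*m/255 + dst * (255 - (src*m/255).alpha) / 255 */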
//#define inner_branch
        asm volatile (
                        "cmp %[w], #0\n\t"
                        "beq 2f\n\t"
                        "1:\n\t"
                        /* load src */
                        "ldr r5, [%[src]], #4\n\t"
#ifdef inner_branch
                        /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
                         * The 0x0 case also allows us to avoid doing an unnecessary data
                         * write, which is more valuable, so we only check for that. */
                        "cmp r5, #0\n\t"
                        "beq 3f\n\t"

#endif
                        "ldr r4, [%[dest]]\n\t"

                        "uxtb16 r6, r5\n\t"
                        "uxtb16 r7, r5, ror #8\n\t"

                        /* multiply each src component by the mask alpha
                         * (%[mask_alpha]), adding the 0x80 rounding term;
                         * the *257 >> 16 division by 255 follows below */
                        "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
                        "mla r7, r7, %[mask_alpha], %[component_half]\n\t"

                        "uxtab16 r6, r6, r6, ror #8\n\t"
                        "uxtab16 r7, r7, r7, ror #8\n\t"

                        "uxtb16 r6, r6, ror #8\n\t"
                        "uxtb16 r7, r7, ror #8\n\t"

                        /* recombine */
                        "orr r5, r6, r7, lsl #8\n\t"

                        "uxtb16 r6, r4\n\t"
                        "uxtb16 r7, r4, ror #8\n\t"

                        /* 255 - alpha */
                        "sub r8, %[alpha_mask], r5, lsr #24\n\t"

                        /* multiply each dst component by 255 - alpha (r8),
                         * then divide by 255 via the *257 >> 16 trick */
                        "mla r6, r6, r8, %[component_half]\n\t"
                        "mla r7, r7, r8, %[component_half]\n\t"

                        "uxtab16 r6, r6, r6, ror #8\n\t"
                        "uxtab16 r7, r7, r7, ror #8\n\t"

                        "uxtb16 r6, r6, ror #8\n\t"
                        "uxtb16 r7, r7, ror #8\n\t"

                        /* recombine */
                        "orr r6, r6, r7, lsl #8\n\t"

                        "uqadd8 r5, r6, r5\n\t"

#ifdef inner_branch
                        "3:\n\t"

#endif
                        "str r5, [%[dest]], #4\n\t"
                        /* decrement the counter and branch back to the top */
                        "subs   %[w], %[w], #1\n\t"
                        "bne    1b\n\t"
                        "2:\n\t"
                        : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
                        : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
                          [alpha_mask] "r" (alpha_mask)
                        : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
                        );
    }
}

void
fbCompositeSolidMask_nx8x8888arm (pixman_op_t op,
                                  pixman_image_t * pSrc,
                                  pixman_image_t * pMask,
                                  pixman_image_t * pDst,
                                  int16_t      xSrc,
                                  int16_t      ySrc,
                                  int16_t      xMask,
                                  int16_t      yMask,
                                  int16_t      xDst,
                                  int16_t      yDst,
                                  uint16_t     width,
                                  uint16_t     height)
{
    uint32_t     src, srca;
    uint32_t    *dstLine, *dst;
    uint8_t     *maskLine, *mask;
    int          dstStride, maskStride;
    uint16_t     w;

    fbComposeGetSolid (pSrc, src, pDst->bits.format);

    srca = src >> 24;
    if (src == 0)
        return;

    uint32_t component_mask = 0xff00ff;
    uint32_t component_half = 0x800080;

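    /* split the solid source into its even (blue/red) and odd (green/alpha)
     * byte lanes once, outside the loop; each per-pixel mla below can then
     * scale both lanes of a half with a single multiply */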
    uint32_t src_hi = (src >> 8) & component_mask;
    uint32_t src_lo = src & component_mask;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        mask = maskLine;
        maskLine += maskStride;
        w = width;

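        /* per pixel: scale the solid src by the 8-bit mask value m, then
         * apply OVER:  dst = src*m/255 + dst * (255 - (src*m/255).alpha) / 255 */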
//#define inner_branch
        asm volatile (
                        "cmp %[w], #0\n\t"
                        "beq 2f\n\t"
                        "1:\n\t"
                        /* load mask */
                        "ldrb r5, [%[mask]], #1\n\t"
#ifdef inner_branch
                        /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
                         * The 0x0 case also allows us to avoid doing an unnecessary data
                         * write, which is more valuable, so we only check for that. */
                        "cmp r5, #0\n\t"
                        "beq 3f\n\t"

#endif
                        "ldr r4, [%[dest]]\n\t"

                        /* multiply the src lanes by the mask value (r5),
                         * adding the 0x80 rounding term; divide by 255 below */
                        "mla r6, %[src_lo], r5, %[component_half]\n\t"
                        "mla r7, %[src_hi], r5, %[component_half]\n\t"

                        "uxtab16 r6, r6, r6, ror #8\n\t"
                        "uxtab16 r7, r7, r7, ror #8\n\t"

                        "uxtb16 r6, r6, ror #8\n\t"
                        "uxtb16 r7, r7, ror #8\n\t"

                        /* recombine */
                        "orr r5, r6, r7, lsl #8\n\t"

                        "uxtb16 r6, r4\n\t"
                        "uxtb16 r7, r4, ror #8\n\t"

                        /* we could simplify this to use 'sub' if we were
                         * willing to give up a register for alpha_mask */
                        "mvn r8, r5\n\t"
                        "mov r8, r8, lsr #24\n\t"

                        /* multiply each dst component by 255 - alpha (r8),
                         * then divide by 255 via the *257 >> 16 trick */
                        "mla r6, r6, r8, %[component_half]\n\t"
                        "mla r7, r7, r8, %[component_half]\n\t"

                        "uxtab16 r6, r6, r6, ror #8\n\t"
                        "uxtab16 r7, r7, r7, ror #8\n\t"

                        "uxtb16 r6, r6, ror #8\n\t"
                        "uxtb16 r7, r7, ror #8\n\t"

                        /* recombine */
                        "orr r6, r6, r7, lsl #8\n\t"

                        "uqadd8 r5, r6, r5\n\t"

#ifdef inner_branch
                        "3:\n\t"

#endif
                        "str r5, [%[dest]], #4\n\t"
                        /* decrement the counter and branch back to the top */
                        "subs   %[w], %[w], #1\n\t"
                        "bne    1b\n\t"
                        "2:\n\t"
                        : [w] "+r" (w), [dest] "+r" (dst), [mask] "+r" (mask)
                        : [component_half] "r" (component_half),
                          [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
                        : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
                        );
    }
}

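/* Each entry maps (operator, src format, mask format, dst format) to the
 * routine that implements it, plus per-entry flags; the list is terminated
 * by a PIXMAN_OP_NONE sentinel. */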
static const FastPathInfo arm_simd_fast_path_array[] =
{
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8888arm,      0 },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8888arm,      0 },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, fbCompositeSrc_8888x8888arm,      0 },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fbCompositeSrc_8888x8888arm,      0 },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,   PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8x8888arm,    NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,   PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8x8888arm,    NEED_SOLID_MASK },

    { PIXMAN_OP_ADD,  PIXMAN_a8,       PIXMAN_null, PIXMAN_a8,       fbCompositeSrcAdd_8000x8000arm,   0 },

    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,   PIXMAN_a8r8g8b8, fbCompositeSolidMask_nx8x8888arm, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,   PIXMAN_x8r8g8b8, fbCompositeSolidMask_nx8x8888arm, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,   PIXMAN_a8b8g8r8, fbCompositeSolidMask_nx8x8888arm, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,   PIXMAN_x8b8g8r8, fbCompositeSolidMask_nx8x8888arm, 0 },

    { PIXMAN_OP_NONE },
};

const FastPathInfo *const arm_simd_fast_paths = arm_simd_fast_path_array;