Rename FastPathInfo to pixman_fast_path_t
[profile/ivi/pixman.git] / pixman / pixman-arm-simd.c
1 /*
2  * Copyright © 2008 Mozilla Corporation
3  *
4  * Permission to use, copy, modify, distribute, and sell this software and its
5  * documentation for any purpose is hereby granted without fee, provided that
6  * the above copyright notice appear in all copies and that both that
7  * copyright notice and this permission notice appear in supporting
8  * documentation, and that the name of Mozilla Corporation not be used in
9  * advertising or publicity pertaining to distribution of the software without
10  * specific, written prior permission.  Mozilla Corporation makes no
11  * representations about the suitability of this software for any purpose.  It
12  * is provided "as is" without express or implied warranty.
13  *
14  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
15  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
16  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
17  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
18  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
19  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
20  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
21  * SOFTWARE.
22  *
23  * Author:  Jeff Muizelaar (jeff@infidigm.net)
24  *
25  */
26 #ifdef HAVE_CONFIG_H
27 #include <config.h>
28 #endif
29
30 #include "pixman-arm-simd.h"
31
32 void
33 fbCompositeSrcAdd_8000x8000arm (
34                             pixman_implementation_t * impl,
35                             pixman_op_t op,
36                                 pixman_image_t * pSrc,
37                                 pixman_image_t * pMask,
38                                 pixman_image_t * pDst,
39                                 int32_t      xSrc,
40                                 int32_t      ySrc,
41                                 int32_t      xMask,
42                                 int32_t      yMask,
43                                 int32_t      xDst,
44                                 int32_t      yDst,
45                                 int32_t      width,
46                                 int32_t      height)
47 {
48     uint8_t     *dstLine, *dst;
49     uint8_t     *srcLine, *src;
50     int dstStride, srcStride;
51     uint16_t    w;
52     uint8_t     s, d;
53
54     fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);
55     fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
56
57     while (height--)
58     {
59         dst = dstLine;
60         dstLine += dstStride;
61         src = srcLine;
62         srcLine += srcStride;
63         w = width;
64
65         /* ensure both src and dst are properly aligned before doing 32 bit reads
66          * we'll stay in this loop if src and dst have differing alignments */
67         while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
68         {
69             s = *src;
70             d = *dst;
71             asm("uqadd8 %0, %1, %2" : "+r"(d) : "r"(s));
72             *dst = d;
73
74             dst++;
75             src++;
76             w--;
77         }
78
79         while (w >= 4)
80         {
81             asm("uqadd8 %0, %1, %2" : "=r"(*(uint32_t*)dst) : "r"(*(uint32_t*)src), "r"(*(uint32_t*)dst));
82             dst += 4;
83             src += 4;
84             w -= 4;
85         }
86
87         while (w)
88         {
89             s = *src;
90             d = *dst;
91             asm("uqadd8 %0, %1, %2" : "+r"(d) : "r"(s));
92             *dst = d;
93
94             dst++;
95             src++;
96             w--;
97         }
98     }
99
100 }
101
102 void
103 fbCompositeSrc_8888x8888arm (
104                             pixman_implementation_t * impl,
105                             pixman_op_t op,
106                          pixman_image_t * pSrc,
107                          pixman_image_t * pMask,
108                          pixman_image_t * pDst,
109                          int32_t      xSrc,
110                          int32_t      ySrc,
111                          int32_t      xMask,
112                          int32_t      yMask,
113                          int32_t      xDst,
114                          int32_t      yDst,
115                          int32_t      width,
116                          int32_t      height)
117 {
118     uint32_t    *dstLine, *dst;
119     uint32_t    *srcLine, *src;
120     int dstStride, srcStride;
121     uint16_t    w;
122     uint32_t component_half = 0x800080;
123     uint32_t upper_component_mask = 0xff00ff00;
124     uint32_t alpha_mask = 0xff;
125
126     fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
127     fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
128
129     while (height--)
130     {
131         dst = dstLine;
132         dstLine += dstStride;
133         src = srcLine;
134         srcLine += srcStride;
135         w = width;
136
137 //#define inner_branch
138         asm volatile (
139                         "cmp %[w], #0\n\t"
140                         "beq 2f\n\t"
141                         "1:\n\t"
142                         /* load src */
143                         "ldr r5, [%[src]], #4\n\t"
144 #ifdef inner_branch
145                         /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
146                          * The 0x0 case also allows us to avoid doing an unecessary data
147                          * write which is more valuable so we only check for that */
148                         "cmp r5, #0\n\t"
149                         "beq 3f\n\t"
150
151                         /* = 255 - alpha */
152                         "sub r8, %[alpha_mask], r5, lsr #24\n\t"
153
154                         "ldr r4, [%[dest]] \n\t"
155
156 #else
157                         "ldr r4, [%[dest]] \n\t"
158
159                         /* = 255 - alpha */
160                         "sub r8, %[alpha_mask], r5, lsr #24\n\t"
161 #endif
162                         "uxtb16 r6, r4\n\t"
163                         "uxtb16 r7, r4, ror #8\n\t"
164
165                         /* multiply by 257 and divide by 65536 */
166                         "mla r6, r6, r8, %[component_half]\n\t"
167                         "mla r7, r7, r8, %[component_half]\n\t"
168
169                         "uxtab16 r6, r6, r6, ror #8\n\t"
170                         "uxtab16 r7, r7, r7, ror #8\n\t"
171
172                         /* recombine the 0xff00ff00 bytes of r6 and r7 */
173                         "and r7, r7, %[upper_component_mask]\n\t"
174                         "uxtab16 r6, r7, r6, ror #8\n\t"
175
176                         "uqadd8 r5, r6, r5\n\t"
177
178 #ifdef inner_branch
179                         "3:\n\t"
180
181 #endif
182                         "str r5, [%[dest]], #4\n\t"
183                         /* increment counter and jmp to top */
184                         "subs   %[w], %[w], #1\n\t"
185                         "bne    1b\n\t"
186                         "2:\n\t"
187                         : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
188                         : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
189                           [alpha_mask] "r" (alpha_mask)
190                         : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
191                         );
192     }
193 }
194
195 void
196 fbCompositeSrc_8888x8x8888arm (
197                             pixman_implementation_t * impl,
198                             pixman_op_t op,
199                                pixman_image_t * pSrc,
200                                pixman_image_t * pMask,
201                                pixman_image_t * pDst,
202                                int32_t  xSrc,
203                                int32_t  ySrc,
204                                int32_t      xMask,
205                                int32_t      yMask,
206                                int32_t      xDst,
207                                int32_t      yDst,
208                                int32_t      width,
209                                int32_t      height)
210 {
211     uint32_t    *dstLine, *dst;
212     uint32_t    *srcLine, *src;
213     uint32_t    mask;
214     int dstStride, srcStride;
215     uint16_t    w;
216     uint32_t component_half = 0x800080;
217     uint32_t alpha_mask = 0xff;
218
219     fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
220     fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
221
222     fbComposeGetSolid (pMask, mask, pDst->bits.format);
223     mask = (mask) >> 24;
224
225     while (height--)
226     {
227         dst = dstLine;
228         dstLine += dstStride;
229         src = srcLine;
230         srcLine += srcStride;
231         w = width;
232
233 //#define inner_branch
234         asm volatile (
235                         "cmp %[w], #0\n\t"
236                         "beq 2f\n\t"
237                         "1:\n\t"
238                         /* load src */
239                         "ldr r5, [%[src]], #4\n\t"
240 #ifdef inner_branch
241                         /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
242                          * The 0x0 case also allows us to avoid doing an unecessary data
243                          * write which is more valuable so we only check for that */
244                         "cmp r5, #0\n\t"
245                         "beq 3f\n\t"
246
247 #endif
248                         "ldr r4, [%[dest]] \n\t"
249
250                         "uxtb16 r6, r5\n\t"
251                         "uxtb16 r7, r5, ror #8\n\t"
252
253                         /* multiply by alpha (r8) then by 257 and divide by 65536 */
254                         "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
255                         "mla r7, r7, %[mask_alpha], %[component_half]\n\t"
256
257                         "uxtab16 r6, r6, r6, ror #8\n\t"
258                         "uxtab16 r7, r7, r7, ror #8\n\t"
259
260                         "uxtb16 r6, r6, ror #8\n\t"
261                         "uxtb16 r7, r7, ror #8\n\t"
262
263                         /* recombine */
264                         "orr r5, r6, r7, lsl #8\n\t"
265
266                         "uxtb16 r6, r4\n\t"
267                         "uxtb16 r7, r4, ror #8\n\t"
268
269                         /* 255 - alpha */
270                         "sub r8, %[alpha_mask], r5, lsr #24\n\t"
271
272                         /* multiply by alpha (r8) then by 257 and divide by 65536 */
273                         "mla r6, r6, r8, %[component_half]\n\t"
274                         "mla r7, r7, r8, %[component_half]\n\t"
275
276                         "uxtab16 r6, r6, r6, ror #8\n\t"
277                         "uxtab16 r7, r7, r7, ror #8\n\t"
278
279                         "uxtb16 r6, r6, ror #8\n\t"
280                         "uxtb16 r7, r7, ror #8\n\t"
281
282                         /* recombine */
283                         "orr r6, r6, r7, lsl #8\n\t"
284
285                         "uqadd8 r5, r6, r5\n\t"
286
287 #ifdef inner_branch
288                         "3:\n\t"
289
290 #endif
291                         "str r5, [%[dest]], #4\n\t"
292                         /* increment counter and jmp to top */
293                         "subs   %[w], %[w], #1\n\t"
294                         "bne    1b\n\t"
295                         "2:\n\t"
296                         : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
297                         : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
298                           [alpha_mask] "r" (alpha_mask)
299                         : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory"
300                         );
301     }
302 }
303
304 void
305 fbCompositeSolidMask_nx8x8888arm (
306                             pixman_implementation_t * impl,
307                             pixman_op_t      op,
308                                pixman_image_t * pSrc,
309                                pixman_image_t * pMask,
310                                pixman_image_t * pDst,
311                                int32_t      xSrc,
312                                int32_t      ySrc,
313                                int32_t      xMask,
314                                int32_t      yMask,
315                                int32_t      xDst,
316                                int32_t      yDst,
317                                int32_t      width,
318                                int32_t      height)
319 {
320     uint32_t     src, srca;
321     uint32_t    *dstLine, *dst;
322     uint8_t     *maskLine, *mask;
323     int          dstStride, maskStride;
324     uint16_t     w;
325
326     fbComposeGetSolid(pSrc, src, pDst->bits.format);
327
328     srca = src >> 24;
329     if (src == 0)
330         return;
331
332     uint32_t component_mask = 0xff00ff;
333     uint32_t component_half = 0x800080;
334
335     uint32_t src_hi = (src >> 8) & component_mask;
336     uint32_t src_lo = src & component_mask;
337
338     fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
339     fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
340
341     while (height--)
342     {
343         dst = dstLine;
344         dstLine += dstStride;
345         mask = maskLine;
346         maskLine += maskStride;
347         w = width;
348
349 //#define inner_branch
350         asm volatile (
351                         "cmp %[w], #0\n\t"
352                         "beq 2f\n\t"
353                         "1:\n\t"
354                         /* load mask */
355                         "ldrb r5, [%[mask]], #1\n\t"
356 #ifdef inner_branch
357                         /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
358                          * The 0x0 case also allows us to avoid doing an unecessary data
359                          * write which is more valuable so we only check for that */
360                         "cmp r5, #0\n\t"
361                         "beq 3f\n\t"
362
363 #endif
364                         "ldr r4, [%[dest]] \n\t"
365
366                         /* multiply by alpha (r8) then by 257 and divide by 65536 */
367                         "mla r6, %[src_lo], r5, %[component_half]\n\t"
368                         "mla r7, %[src_hi], r5, %[component_half]\n\t"
369
370                         "uxtab16 r6, r6, r6, ror #8\n\t"
371                         "uxtab16 r7, r7, r7, ror #8\n\t"
372
373                         "uxtb16 r6, r6, ror #8\n\t"
374                         "uxtb16 r7, r7, ror #8\n\t"
375
376                         /* recombine */
377                         "orr r5, r6, r7, lsl #8\n\t"
378
379                         "uxtb16 r6, r4\n\t"
380                         "uxtb16 r7, r4, ror #8\n\t"
381
382                         /* we could simplify this to use 'sub' if we were
383                          * willing to give up a register for alpha_mask */
384                         "mvn r8, r5\n\t"
385                         "mov r8, r8, lsr #24\n\t"
386
387                         /* multiply by alpha (r8) then by 257 and divide by 65536 */
388                         "mla r6, r6, r8, %[component_half]\n\t"
389                         "mla r7, r7, r8, %[component_half]\n\t"
390
391                         "uxtab16 r6, r6, r6, ror #8\n\t"
392                         "uxtab16 r7, r7, r7, ror #8\n\t"
393
394                         "uxtb16 r6, r6, ror #8\n\t"
395                         "uxtb16 r7, r7, ror #8\n\t"
396
397                         /* recombine */
398                         "orr r6, r6, r7, lsl #8\n\t"
399
400                         "uqadd8 r5, r6, r5\n\t"
401
402 #ifdef inner_branch
403                         "3:\n\t"
404
405 #endif
406                         "str r5, [%[dest]], #4\n\t"
407                         /* increment counter and jmp to top */
408                         "subs   %[w], %[w], #1\n\t"
409                         "bne    1b\n\t"
410                         "2:\n\t"
411                         : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask)
412                         : [component_half] "r" (component_half),
413                           [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
414                         : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
415                         );
416     }
417 }
418
/* Fast-path dispatch table: each entry matches (op, src format, mask format,
 * dest format) and names the routine that handles that combination, plus a
 * flags word (NEED_SOLID_MASK entries require the mask to be a solid image).
 * The table is terminated by the PIXMAN_OP_NONE sentinel entry. */
static const pixman_fast_path_t arm_simd_fast_path_array[] =
{
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8888arm,      0 },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8888arm,      0 },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, fbCompositeSrc_8888x8888arm,      0 },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, fbCompositeSrc_8888x8888arm,      0 },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8x8888arm,    NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8x8888arm,    NEED_SOLID_MASK },

    { PIXMAN_OP_ADD, PIXMAN_a8,        PIXMAN_null,     PIXMAN_a8,       fbCompositeSrcAdd_8000x8000arm,   0 },

    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8r8g8b8, fbCompositeSolidMask_nx8x8888arm,     0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, fbCompositeSolidMask_nx8x8888arm,     0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, fbCompositeSolidMask_nx8x8888arm,     0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8b8g8r8, fbCompositeSolidMask_nx8x8888arm,     0 },

    { PIXMAN_OP_NONE },
};

/* Exported alias for the table; presumably consumed outside this file
 * (declared in a header we cannot see here) — hence not static. */
const pixman_fast_path_t *const arm_simd_fast_paths = arm_simd_fast_path_array;
439
440 static void
441 arm_simd_composite (pixman_implementation_t *imp,
442                 pixman_op_t     op,
443                 pixman_image_t *src,
444                 pixman_image_t *mask,
445                 pixman_image_t *dest,
446                 int32_t         src_x,
447                 int32_t         src_y,
448                 int32_t         mask_x,
449                 int32_t         mask_y,
450                 int32_t         dest_x,
451                 int32_t         dest_y,
452                 int32_t        width,
453                 int32_t        height)
454 {
455     if (_pixman_run_fast_path (arm_simd_fast_paths, imp,
456                                op, src, mask, dest,
457                                src_x, src_y,
458                                mask_x, mask_y,
459                                dest_x, dest_y,
460                                width, height))
461     {
462         return;
463     }
464
465     _pixman_implementation_composite (imp->delegate, op,
466                                       src, mask, dest,
467                                       src_x, src_y,
468                                       mask_x, mask_y,
469                                       dest_x, dest_y,
470                                       width, height);
471 }
472
473 pixman_implementation_t *
474 _pixman_implementation_create_arm_simd (void)
475 {
476     pixman_implementation_t *general = _pixman_implementation_create_fast_path ();
477     pixman_implementation_t *imp = _pixman_implementation_create (general);
478
479     imp->composite = arm_simd_composite;
480
481     return imp;
482 }