Change the name of some routines that were simply misnamed.
pixman/pixman-arm-simd.c
/*
 * Copyright © 2008 Mozilla Corporation
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Mozilla Corporation not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  Mozilla Corporation makes no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Jeff Muizelaar (jeff@infidigm.net)
 *
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include "pixman-private.h"
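
/*
 * These fast paths use the ARMv6 SIMD media instructions (uqadd8, uxtb16,
 * uxtab16), so they require an ARMv6 or later core.
 */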
static void
arm_composite_add_8000_8000 (pixman_implementation_t *impl,
                             pixman_op_t               op,
                             pixman_image_t           *pSrc,
                             pixman_image_t           *pMask,
                             pixman_image_t           *pDst,
                             int32_t                   xSrc,
                             int32_t                   ySrc,
                             int32_t                   xMask,
                             int32_t                   yMask,
                             int32_t                   xDst,
                             int32_t                   yDst,
                             int32_t                   width,
                             int32_t                   height)
{
    uint8_t  *dstLine, *dst;
    uint8_t  *srcLine, *src;
    int       dstStride, srcStride;
    uint16_t  w;
    uint8_t   s, d;

    fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);
    fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;

        /* Ensure both src and dst are properly aligned before doing 32-bit
         * reads; we'll stay in this loop if src and dst have differing
         * alignments. */
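        /* uqadd8 adds the four byte lanes of its operands with unsigned
         * saturation: each result byte is MIN (s + d, 255), which is
         * exactly the ADD operator on a8 data. */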
        while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
        {
            s = *src;
            d = *dst;
            asm ("uqadd8 %0, %0, %1" : "+r" (d) : "r" (s));
            *dst = d;

            dst++;
            src++;
            w--;
        }

        while (w >= 4)
        {
            asm ("uqadd8 %0, %1, %2"
                 : "=r" (*(uint32_t*)dst)
                 : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst));
            dst += 4;
            src += 4;
            w -= 4;
        }

        while (w)
        {
            s = *src;
            d = *dst;
            asm ("uqadd8 %0, %0, %1" : "+r" (d) : "r" (s));
            *dst = d;

            dst++;
            src++;
            w--;
        }
    }
}
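
/*
 * For reference only (hypothetical helper, kept out of the build): a scalar
 * sketch of what each byte lane of the uqadd8 instructions above computes.
 */
#if 0
static inline uint8_t
add_saturate_u8 (uint8_t s, uint8_t d)
{
    uint16_t t = (uint16_t) s + d;

    return t > 0xff ? 0xff : (uint8_t) t;
}
#endif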

static void
arm_composite_over_8888_8888 (pixman_implementation_t *impl,
                              pixman_op_t               op,
                              pixman_image_t           *pSrc,
                              pixman_image_t           *pMask,
                              pixman_image_t           *pDst,
                              int32_t                   xSrc,
                              int32_t                   ySrc,
                              int32_t                   xMask,
                              int32_t                   yMask,
                              int32_t                   xDst,
                              int32_t                   yDst,
                              int32_t                   width,
                              int32_t                   height)
{
    uint32_t *dstLine, *dst;
    uint32_t *srcLine, *src;
    int       dstStride, srcStride;
    uint16_t  w;
    uint32_t  component_half = 0x800080;
    uint32_t  upper_component_mask = 0xff00ff00;
    uint32_t  alpha_mask = 0xff;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;

//#define inner_branch
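        /* Per pixel this computes the premultiplied OVER operator,
         *
         *     dest = src + dest * (255 - src.alpha) / 255
         *
         * two channels at a time: uxtb16 spreads the four bytes of dest
         * across two registers as 16-bit lanes, mla scales each lane by
         * the inverse alpha and adds 0x80, and the uxtab16/ror pairs fold
         * the high byte of each lane back in, approximating the division
         * by 255. */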
        asm volatile (
                        "cmp %[w], #0\n\t"
                        "beq 2f\n\t"
                        "1:\n\t"
                        /* load src */
                        "ldr r5, [%[src]], #4\n\t"
#ifdef inner_branch
                        /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
                         * The 0x0 case also allows us to avoid doing an unnecessary data
                         * write, which is more valuable, so we only check for that. */
                        "cmp r5, #0\n\t"
                        "beq 3f\n\t"

                        /* = 255 - alpha */
                        "sub r8, %[alpha_mask], r5, lsr #24\n\t"

                        "ldr r4, [%[dest]] \n\t"

#else
                        "ldr r4, [%[dest]] \n\t"

                        /* = 255 - alpha */
                        "sub r8, %[alpha_mask], r5, lsr #24\n\t"
#endif
                        "uxtb16 r6, r4\n\t"
                        "uxtb16 r7, r4, ror #8\n\t"

                        /* multiply by 257 and divide by 65536, i.e. approximate x / 255 */
                        "mla r6, r6, r8, %[component_half]\n\t"
                        "mla r7, r7, r8, %[component_half]\n\t"

                        "uxtab16 r6, r6, r6, ror #8\n\t"
                        "uxtab16 r7, r7, r7, ror #8\n\t"

                        /* recombine the 0xff00ff00 bytes of r6 and r7 */
                        "and r7, r7, %[upper_component_mask]\n\t"
                        "uxtab16 r6, r7, r6, ror #8\n\t"

                        "uqadd8 r5, r6, r5\n\t"

#ifdef inner_branch
                        "3:\n\t"

#endif
                        "str r5, [%[dest]], #4\n\t"
                        /* decrement counter and loop back to the top */
                        "subs   %[w], %[w], #1\n\t"
                        "bne    1b\n\t"
                        "2:\n\t"
                        : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
                        : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
                          [alpha_mask] "r" (alpha_mask)
                        : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
                        );
    }
}
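
/*
 * Reference sketch (hypothetical helper, kept out of the build): the scalar
 * equivalent of one pixel of the OVER loop above, assuming premultiplied
 * a8r8g8b8.  It uses the same x / 255 ~= (x * 257 + 0x8000) >> 16
 * approximation that the mla/uxtab16 pairs implement two channels at a time.
 */
#if 0
static inline uint32_t
over_8888_scalar (uint32_t src, uint32_t dst)
{
    uint32_t ialpha = 0xff - (src >> 24);
    uint32_t rb = (dst & 0xff00ff) * ialpha + 0x800080;
    uint32_t ag = ((dst >> 8) & 0xff00ff) * ialpha + 0x800080;

    /* fold each lane's high byte back in and take the high byte: x / 255 */
    rb = ((rb + ((rb >> 8) & 0xff00ff)) >> 8) & 0xff00ff;
    ag = (ag + ((ag >> 8) & 0xff00ff)) & 0xff00ff00;

    /* uqadd8 saturates each byte; for well-formed premultiplied input a
     * plain add cannot overflow, so a plain add is used here */
    return src + (ag | rb);
}
#endif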

static void
arm_composite_over_8888_n_8888 (pixman_implementation_t *impl,
                                pixman_op_t               op,
                                pixman_image_t           *pSrc,
                                pixman_image_t           *pMask,
                                pixman_image_t           *pDst,
                                int32_t                   xSrc,
                                int32_t                   ySrc,
                                int32_t                   xMask,
                                int32_t                   yMask,
                                int32_t                   xDst,
                                int32_t                   yDst,
                                int32_t                   width,
                                int32_t                   height)
{
    uint32_t *dstLine, *dst;
    uint32_t *srcLine, *src;
    uint32_t  mask;
    int       dstStride, srcStride;
    uint16_t  w;
    uint32_t  component_half = 0x800080;
    uint32_t  alpha_mask = 0xff;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

    mask = _pixman_image_get_solid (pMask, pDst->bits.format);
    mask = mask >> 24;

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;

//#define inner_branch
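        /* Same OVER computation as above, except that each source pixel
         * is first scaled by the constant mask alpha:
         *
         *     dest = (src * mask.alpha / 255) OVER dest */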
        asm volatile (
                        "cmp %[w], #0\n\t"
                        "beq 2f\n\t"
                        "1:\n\t"
                        /* load src */
                        "ldr r5, [%[src]], #4\n\t"
#ifdef inner_branch
                        /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
                         * The 0x0 case also allows us to avoid doing an unnecessary data
                         * write, which is more valuable, so we only check for that. */
                        "cmp r5, #0\n\t"
                        "beq 3f\n\t"

#endif
                        "ldr r4, [%[dest]] \n\t"

                        "uxtb16 r6, r5\n\t"
                        "uxtb16 r7, r5, ror #8\n\t"

                        /* multiply by the mask alpha, then by 257 and divide by 65536 */
                        "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
                        "mla r7, r7, %[mask_alpha], %[component_half]\n\t"

                        "uxtab16 r6, r6, r6, ror #8\n\t"
                        "uxtab16 r7, r7, r7, ror #8\n\t"

                        "uxtb16 r6, r6, ror #8\n\t"
                        "uxtb16 r7, r7, ror #8\n\t"

                        /* recombine */
                        "orr r5, r6, r7, lsl #8\n\t"

                        "uxtb16 r6, r4\n\t"
                        "uxtb16 r7, r4, ror #8\n\t"

                        /* 255 - alpha */
                        "sub r8, %[alpha_mask], r5, lsr #24\n\t"

                        /* multiply by the inverse alpha (r8), then by 257 and divide by 65536 */
                        "mla r6, r6, r8, %[component_half]\n\t"
                        "mla r7, r7, r8, %[component_half]\n\t"

                        "uxtab16 r6, r6, r6, ror #8\n\t"
                        "uxtab16 r7, r7, r7, ror #8\n\t"

                        "uxtb16 r6, r6, ror #8\n\t"
                        "uxtb16 r7, r7, ror #8\n\t"

                        /* recombine */
                        "orr r6, r6, r7, lsl #8\n\t"

                        "uqadd8 r5, r6, r5\n\t"

#ifdef inner_branch
                        "3:\n\t"

#endif
                        "str r5, [%[dest]], #4\n\t"
                        /* decrement counter and loop back to the top */
                        "subs   %[w], %[w], #1\n\t"
                        "bne    1b\n\t"
                        "2:\n\t"
                        : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
                        : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
                          [alpha_mask] "r" (alpha_mask)
                        : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
                        );
    }
}

static void
arm_composite_over_n_8_8888 (pixman_implementation_t *impl,
                             pixman_op_t               op,
                             pixman_image_t           *pSrc,
                             pixman_image_t           *pMask,
                             pixman_image_t           *pDst,
                             int32_t                   xSrc,
                             int32_t                   ySrc,
                             int32_t                   xMask,
                             int32_t                   yMask,
                             int32_t                   xDst,
                             int32_t                   yDst,
                             int32_t                   width,
                             int32_t                   height)
{
    uint32_t  src;
    uint32_t *dstLine, *dst;
    uint8_t  *maskLine, *mask;
    int       dstStride, maskStride;
    uint16_t  w;

    src = _pixman_image_get_solid (pSrc, pDst->bits.format);

    /* bail out if the source is fully transparent */
    if (src == 0)
        return;

    uint32_t component_mask = 0xff00ff;
    uint32_t component_half = 0x800080;

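    /* Split the solid source into 16-bit lanes once, outside the loop:
     * for a8r8g8b8, src_lo holds the red and blue bytes and src_hi holds
     * the alpha and green bytes. */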
    uint32_t src_hi = (src >> 8) & component_mask;
    uint32_t src_lo = src & component_mask;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        mask = maskLine;
        maskLine += maskStride;
        w = width;

//#define inner_branch
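        /* Per pixel this computes (solid src * mask / 255) OVER dest,
         * using the precomputed src_lo/src_hi lane splits and the 8-bit
         * mask value loaded into r5. */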
        asm volatile (
                        "cmp %[w], #0\n\t"
                        "beq 2f\n\t"
                        "1:\n\t"
                        /* load mask */
                        "ldrb r5, [%[mask]], #1\n\t"
#ifdef inner_branch
                        /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
                         * The 0x0 case also allows us to avoid doing an unnecessary data
                         * write, which is more valuable, so we only check for that. */
                        "cmp r5, #0\n\t"
                        "beq 3f\n\t"

#endif
                        "ldr r4, [%[dest]] \n\t"

                        /* multiply by the mask alpha (r5), then by 257 and divide by 65536 */
                        "mla r6, %[src_lo], r5, %[component_half]\n\t"
                        "mla r7, %[src_hi], r5, %[component_half]\n\t"

                        "uxtab16 r6, r6, r6, ror #8\n\t"
                        "uxtab16 r7, r7, r7, ror #8\n\t"

                        "uxtb16 r6, r6, ror #8\n\t"
                        "uxtb16 r7, r7, ror #8\n\t"

                        /* recombine */
                        "orr r5, r6, r7, lsl #8\n\t"

                        "uxtb16 r6, r4\n\t"
                        "uxtb16 r7, r4, ror #8\n\t"

                        /* we could simplify this to use 'sub' if we were
                         * willing to give up a register for alpha_mask */
                        "mvn r8, r5\n\t"
                        "mov r8, r8, lsr #24\n\t"

                        /* multiply by the inverse alpha (r8), then by 257 and divide by 65536 */
                        "mla r6, r6, r8, %[component_half]\n\t"
                        "mla r7, r7, r8, %[component_half]\n\t"

                        "uxtab16 r6, r6, r6, ror #8\n\t"
                        "uxtab16 r7, r7, r7, ror #8\n\t"

                        "uxtb16 r6, r6, ror #8\n\t"
                        "uxtb16 r7, r7, ror #8\n\t"

                        /* recombine */
                        "orr r6, r6, r7, lsl #8\n\t"

                        "uqadd8 r5, r6, r5\n\t"

#ifdef inner_branch
                        "3:\n\t"

#endif
                        "str r5, [%[dest]], #4\n\t"
                        /* decrement counter and loop back to the top */
                        "subs   %[w], %[w], #1\n\t"
                        "bne    1b\n\t"
                        "2:\n\t"
                        : [w] "+r" (w), [dest] "+r" (dst), [mask] "+r" (mask)
                        : [component_half] "r" (component_half),
                          [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
                        : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
                        );
    }
}
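
/* Each entry is { op, src format, mask format, dest format, function, flags } */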
static const pixman_fast_path_t arm_simd_fast_path_array[] =
{
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, arm_composite_over_8888_8888,   0 },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, arm_composite_over_8888_8888,   0 },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, arm_composite_over_8888_8888,   0 },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, arm_composite_over_8888_8888,   0 },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,   PIXMAN_a8r8g8b8, arm_composite_over_8888_n_8888, NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,   PIXMAN_x8r8g8b8, arm_composite_over_8888_n_8888, NEED_SOLID_MASK },

    { PIXMAN_OP_ADD,  PIXMAN_a8,       PIXMAN_null, PIXMAN_a8,       arm_composite_add_8000_8000,    0 },

    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,   PIXMAN_a8r8g8b8, arm_composite_over_n_8_8888,    0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,   PIXMAN_x8r8g8b8, arm_composite_over_n_8_8888,    0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,   PIXMAN_a8b8g8r8, arm_composite_over_n_8_8888,    0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,   PIXMAN_x8b8g8r8, arm_composite_over_n_8_8888,    0 },

    { PIXMAN_OP_NONE },
};

const pixman_fast_path_t *const arm_simd_fast_paths = arm_simd_fast_path_array;

static void
arm_simd_composite (pixman_implementation_t *imp,
                    pixman_op_t               op,
                    pixman_image_t           *src,
                    pixman_image_t           *mask,
                    pixman_image_t           *dest,
                    int32_t                   src_x,
                    int32_t                   src_y,
                    int32_t                   mask_x,
                    int32_t                   mask_y,
                    int32_t                   dest_x,
                    int32_t                   dest_y,
                    int32_t                   width,
                    int32_t                   height)
{
    if (_pixman_run_fast_path (arm_simd_fast_paths, imp,
                               op, src, mask, dest,
                               src_x, src_y,
                               mask_x, mask_y,
                               dest_x, dest_y,
                               width, height))
    {
        return;
    }

    _pixman_implementation_composite (imp->delegate, op,
                                      src, mask, dest,
                                      src_x, src_y,
                                      mask_x, mask_y,
                                      dest_x, dest_y,
                                      width, height);
}
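
/*
 * The ARM SIMD implementation delegates: requests that match an entry in
 * arm_simd_fast_paths are handled above; everything else falls through to
 * the general implementation created here as the delegate.
 */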
pixman_implementation_t *
_pixman_implementation_create_arm_simd (void)
{
    pixman_implementation_t *general = _pixman_implementation_create_fast_path ();
    pixman_implementation_t *imp = _pixman_implementation_create (general);

    imp->composite = arm_simd_composite;

    return imp;
}