/*
 * Copyright © 2008 Mozilla Corporation
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Mozilla Corporation not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  Mozilla Corporation makes no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Jeff Muizelaar (jeff@infidigm.net)
 *
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include "pixman-private.h"

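/*
 * For reference: the assembly in this file leans on two scalar identities.
 * The illustrative helpers below (hypothetical names, not called by any of
 * the fast paths) spell them out in plain C.
 */

/* uqadd8 performs this per-byte saturating add on four bytes at once */
static inline uint8_t
illustrative_add_saturate_u8 (uint8_t a, uint8_t b)
{
    unsigned sum = (unsigned)a + (unsigned)b;

    return sum > 0xff ? 0xff : (uint8_t)sum;
}

/*
 * Rounding division by 255: for 0 <= v <= 255 * 255, computing
 * t = v + 0x80 and then (t + (t >> 8)) >> 8 yields v / 255 rounded to
 * nearest.  The mla/uxtab16/uxtb16 sequences below compute exactly this,
 * two 8-bit channels per 32-bit register.
 */
static inline unsigned
illustrative_div_255 (unsigned v)
{
    unsigned t = v + 0x80;

    return (t + (t >> 8)) >> 8;
}
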
static void
arm_composite_add_8000_8000 (pixman_implementation_t * impl,
                             pixman_op_t               op,
                             pixman_image_t *          src_image,
                             pixman_image_t *          mask_image,
                             pixman_image_t *          dst_image,
                             int32_t                   src_x,
                             int32_t                   src_y,
                             int32_t                   mask_x,
                             int32_t                   mask_y,
                             int32_t                   dest_x,
                             int32_t                   dest_y,
                             int32_t                   width,
                             int32_t                   height)
{
    uint8_t     *dst_line, *dst;
    uint8_t     *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;
    uint8_t s, d;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        /* Align both src and dst before doing 32-bit reads; if the two
         * pointers have differing alignments, we never leave this loop
         * and the whole scanline is handled byte by byte.
         */
        while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
        {
            s = *src;
            d = *dst;
            /* d = saturate (d + s); with the "+r" constraint, GCC numbers
             * the tied input half of operand 0 as %2 */
            asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
            *dst = d;

            dst++;
            src++;
            w--;
        }

        /* four saturating byte adds per instruction once both pointers
         * are word-aligned */
        while (w >= 4)
        {
            asm ("uqadd8 %0, %1, %2"
                 : "=r" (*(uint32_t*)dst)
                 : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst));
            dst += 4;
            src += 4;
            w -= 4;
        }

        /* byte-wise tail */
        while (w)
        {
            s = *src;
            d = *dst;
            asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
            *dst = d;

            dst++;
            src++;
            w--;
        }
    }
}
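
/*
 * The loop above follows the usual head/body/tail pattern: a byte-wise
 * head until both pointers are word-aligned, a word loop doing four
 * saturating byte adds per uqadd8, and a byte-wise tail.  Per byte it is
 * equivalent to *dst = illustrative_add_saturate_u8 (*dst, *src) using
 * the sketch near the top of the file.
 */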

static void
arm_composite_over_8888_8888 (pixman_implementation_t * impl,
                              pixman_op_t               op,
                              pixman_image_t *          src_image,
                              pixman_image_t *          mask_image,
                              pixman_image_t *          dst_image,
                              int32_t                   src_x,
                              int32_t                   src_y,
                              int32_t                   mask_x,
                              int32_t                   mask_y,
                              int32_t                   dest_x,
                              int32_t                   dest_y,
                              int32_t                   width,
                              int32_t                   height)
{
    uint32_t    *dst_line, *dst;
    uint32_t    *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;
    uint32_t component_half = 0x800080;
    uint32_t upper_component_mask = 0xff00ff00;
    uint32_t alpha_mask = 0xff;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

/* #define inner_branch */
        asm volatile (
            "cmp %[w], #0\n\t"
            "beq 2f\n\t"
            "1:\n\t"
            /* load src */
            "ldr r5, [%[src]], #4\n\t"
#ifdef inner_branch
            /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
             * The 0x0 case also lets us skip an unnecessary data write, which
             * is the more valuable saving, so we only check for that.
             */
            "cmp r5, #0\n\t"
            "beq 3f\n\t"

            /* = 255 - alpha */
            "sub r8, %[alpha_mask], r5, lsr #24\n\t"

            "ldr r4, [%[dest]] \n\t"

#else
            "ldr r4, [%[dest]] \n\t"

            /* = 255 - alpha */
            "sub r8, %[alpha_mask], r5, lsr #24\n\t"
#endif
            "uxtb16 r6, r4\n\t"
            "uxtb16 r7, r4, ror #8\n\t"

            /* multiply by 255 - alpha (r8), then by 257, and divide by 65536 */
            "mla r6, r6, r8, %[component_half]\n\t"
            "mla r7, r7, r8, %[component_half]\n\t"

            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"

            /* recombine the 0xff00ff00 bytes of r6 and r7 */
            "and r7, r7, %[upper_component_mask]\n\t"
            "uxtab16 r6, r7, r6, ror #8\n\t"

            "uqadd8 r5, r6, r5\n\t"

#ifdef inner_branch
            "3:\n\t"

#endif
            "str r5, [%[dest]], #4\n\t"
            /* decrement loop counter and branch back to the top */
            "subs       %[w], %[w], #1\n\t"
            "bne        1b\n\t"
            "2:\n\t"
            : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
            : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
              [alpha_mask] "r" (alpha_mask)
            : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
            );
    }
}
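
/*
 * A minimal scalar model of the premultiplied OVER operator computed
 * above (hypothetical helper, for illustration only): each destination
 * channel becomes src + dst * (255 - src_alpha) / 255, with the division
 * done by the rounding trick shown at the top of the file.  Valid
 * premultiplied input cannot overflow a channel, so a plain add stands
 * in for the defensive uqadd8.
 */
static inline uint32_t
illustrative_over (uint32_t src, uint32_t dst)
{
    unsigned inv_a = 255 - (src >> 24);
    uint32_t result = src;
    int shift;

    for (shift = 0; shift < 32; shift += 8)
    {
        unsigned d = (dst >> shift) & 0xff;

        result += illustrative_div_255 (d * inv_a) << shift;
    }

    return result;
}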

static void
arm_composite_over_8888_n_8888 (pixman_implementation_t * impl,
                                pixman_op_t               op,
                                pixman_image_t *          src_image,
                                pixman_image_t *          mask_image,
                                pixman_image_t *          dst_image,
                                int32_t                   src_x,
                                int32_t                   src_y,
                                int32_t                   mask_x,
                                int32_t                   mask_y,
                                int32_t                   dest_x,
                                int32_t                   dest_y,
                                int32_t                   width,
                                int32_t                   height)
{
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t mask;
    int dst_stride, src_stride;
    int32_t w;
    uint32_t component_half = 0x800080;
    uint32_t alpha_mask = 0xff;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
    mask >>= 24;

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

/* #define inner_branch */
        asm volatile (
            "cmp %[w], #0\n\t"
            "beq 2f\n\t"
            "1:\n\t"
            /* load src */
            "ldr r5, [%[src]], #4\n\t"
#ifdef inner_branch
            /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
             * The 0x0 case also lets us skip an unnecessary data write, which
             * is the more valuable saving, so we only check for that.
             */
            "cmp r5, #0\n\t"
            "beq 3f\n\t"

#endif
            "ldr r4, [%[dest]] \n\t"

            "uxtb16 r6, r5\n\t"
            "uxtb16 r7, r5, ror #8\n\t"

            /* multiply by the constant mask alpha, then by 257, and divide by 65536 */
            "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
            "mla r7, r7, %[mask_alpha], %[component_half]\n\t"

            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"

            "uxtb16 r6, r6, ror #8\n\t"
            "uxtb16 r7, r7, ror #8\n\t"

            /* recombine */
            "orr r5, r6, r7, lsl #8\n\t"

            "uxtb16 r6, r4\n\t"
            "uxtb16 r7, r4, ror #8\n\t"

            /* 255 - alpha */
            "sub r8, %[alpha_mask], r5, lsr #24\n\t"

            /* multiply by 255 - alpha (r8), then by 257, and divide by 65536 */
            "mla r6, r6, r8, %[component_half]\n\t"
            "mla r7, r7, r8, %[component_half]\n\t"

            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"

            "uxtb16 r6, r6, ror #8\n\t"
            "uxtb16 r7, r7, ror #8\n\t"

            /* recombine */
            "orr r6, r6, r7, lsl #8\n\t"

            "uqadd8 r5, r6, r5\n\t"

#ifdef inner_branch
            "3:\n\t"

#endif
            "str r5, [%[dest]], #4\n\t"
            /* decrement loop counter and branch back to the top */
            "subs       %[w], %[w], #1\n\t"
            "bne        1b\n\t"
            "2:\n\t"
            : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
            : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
              [alpha_mask] "r" (alpha_mask)
            : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
            );
    }
}
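
/*
 * The variant above is OVER with the source first scaled by a constant
 * mask alpha.  A scalar model of the scaling step (hypothetical, for
 * illustration): multiply every channel, alpha included, by m / 255, so
 * that per pixel the function behaves like
 * illustrative_over (illustrative_in (*src, mask), *dst).
 */
static inline uint32_t
illustrative_in (uint32_t src, unsigned m)
{
    uint32_t result = 0;
    int shift;

    for (shift = 0; shift < 32; shift += 8)
    {
        unsigned c = (src >> shift) & 0xff;

        result |= illustrative_div_255 (c * m) << shift;
    }

    return result;
}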

static void
arm_composite_over_n_8_8888 (pixman_implementation_t * impl,
                             pixman_op_t               op,
                             pixman_image_t *          src_image,
                             pixman_image_t *          mask_image,
                             pixman_image_t *          dst_image,
                             int32_t                   src_x,
                             int32_t                   src_y,
                             int32_t                   mask_x,
                             int32_t                   mask_y,
                             int32_t                   dest_x,
                             int32_t                   dest_y,
                             int32_t                   width,
                             int32_t                   height)
{
    uint32_t src;
    uint32_t *dst_line, *dst;
    uint8_t  *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t component_mask = 0xff00ff;
    uint32_t component_half = 0x800080;
    uint32_t src_hi, src_lo;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    /* bail out if fully transparent */
    if (src == 0)
        return;

    /* pre-split the solid color into its even and odd components */
    src_hi = (src >> 8) & component_mask;
    src_lo = src & component_mask;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

/* #define inner_branch */
        asm volatile (
            "cmp %[w], #0\n\t"
            "beq 2f\n\t"
            "1:\n\t"
            /* load mask */
            "ldrb r5, [%[mask]], #1\n\t"
#ifdef inner_branch
            /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
             * The 0x0 case also lets us skip an unnecessary data write, which
             * is the more valuable saving, so we only check for that.
             */
            "cmp r5, #0\n\t"
            "beq 3f\n\t"

#endif
            "ldr r4, [%[dest]] \n\t"

            /* multiply by the mask alpha (r5), then by 257, and divide by 65536 */
            "mla r6, %[src_lo], r5, %[component_half]\n\t"
            "mla r7, %[src_hi], r5, %[component_half]\n\t"

            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"

            "uxtb16 r6, r6, ror #8\n\t"
            "uxtb16 r7, r7, ror #8\n\t"

            /* recombine */
            "orr r5, r6, r7, lsl #8\n\t"

            "uxtb16 r6, r4\n\t"
            "uxtb16 r7, r4, ror #8\n\t"

            /* we could simplify this to use 'sub' if we were
             * willing to give up a register for alpha_mask
             */
            "mvn r8, r5\n\t"
            "mov r8, r8, lsr #24\n\t"

            /* multiply by 255 - alpha (r8), then by 257, and divide by 65536 */
            "mla r6, r6, r8, %[component_half]\n\t"
            "mla r7, r7, r8, %[component_half]\n\t"

            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"

            "uxtb16 r6, r6, ror #8\n\t"
            "uxtb16 r7, r7, ror #8\n\t"

            /* recombine */
            "orr r6, r6, r7, lsl #8\n\t"

            "uqadd8 r5, r6, r5\n\t"

#ifdef inner_branch
            "3:\n\t"

#endif
            "str r5, [%[dest]], #4\n\t"
            /* decrement loop counter and branch back to the top */
            "subs       %[w], %[w], #1\n\t"
            "bne        1b\n\t"
            "2:\n\t"
            : [w] "+r" (w), [dest] "+r" (dst), [mask] "+r" (mask)
            : [component_half] "r" (component_half),
              [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
            : "r4", "r5", "r6", "r7", "r8", "cc", "memory");
    }
}
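
/*
 * In the same scalar terms, the function above computes
 * *dst = illustrative_over (illustrative_in (src, *mask), *dst) for a
 * solid source and an a8 mask; pre-splitting the color into src_hi and
 * src_lo reduces the per-pixel scaling to two mla instructions.
 */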

static const pixman_fast_path_t arm_simd_fast_paths[] =
{
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, arm_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, arm_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, arm_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, arm_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, arm_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, arm_composite_over_8888_n_8888),

    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, arm_composite_add_8000_8000),

    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, arm_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, arm_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, arm_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, arm_composite_over_n_8_8888),

    { PIXMAN_OP_NONE },
};
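
/*
 * Each entry matches a composite request by (operator, source format,
 * mask format, destination format); the table is scanned in order, the
 * first match wins, and { PIXMAN_OP_NONE } terminates the list.  Requests
 * that match nothing fall through to the fallback implementation chained
 * in below.
 */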

pixman_implementation_t *
_pixman_implementation_create_arm_simd (void)
{
    pixman_implementation_t *general = _pixman_implementation_create_fast_path ();
    pixman_implementation_t *imp = _pixman_implementation_create (general, arm_simd_fast_paths);

    return imp;
}