Check alignment of 'src' pointer in optimized ARM routines
pixman/pixman-arm-simd.c
/*
 * Copyright © 2008 Mozilla Corporation
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Mozilla Corporation not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  Mozilla Corporation makes no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Jeff Muizelaar (jeff@infidigm.net)
 *
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include "pixman-arm-simd.h"

void
fbCompositeSrcAdd_8000x8000arm (pixman_op_t op,
                                pixman_image_t * pSrc,
                                pixman_image_t * pMask,
                                pixman_image_t * pDst,
                                int16_t      xSrc,
                                int16_t      ySrc,
                                int16_t      xMask,
                                int16_t      yMask,
                                int16_t      xDst,
                                int16_t      yDst,
                                uint16_t     width,
                                uint16_t     height)
{
    uint8_t     *dstLine, *dst;
    uint8_t     *srcLine, *src;
    int dstStride, srcStride;
    uint16_t    w;
    uint8_t     s, d;

    fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);
    fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;

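        /* Process leading bytes one at a time until both src and dst are
         * 32-bit aligned; if their alignments differ they can never line up,
         * so this loop then handles the whole scanline a byte at a time. */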
        while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
        {
            s = *src;
            d = *dst;
            asm("uqadd8 %0, %1, %0" : "+r"(d) : "r"(s));
            *dst = d;

            dst++;
            src++;
            w--;
        }

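        /* uqadd8 adds the four bytes of a word in parallel, saturating each
         * byte at 0xff, so the aligned loop handles four pixels per
         * iteration. */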
        while (w >= 4)
        {
            asm("uqadd8 %0, %1, %2" : "=r"(*(uint32_t*)dst) : "r"(*(uint32_t*)src), "r"(*(uint32_t*)dst));
            dst += 4;
            src += 4;
            w -= 4;
        }

        while (w)
        {
            s = *src;
            d = *dst;
            asm("uqadd8 %0, %1, %0" : "+r"(d) : "r"(s));
            *dst = d;

            dst++;
            src++;
            w--;
        }
    }

}
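
/* Illustrative only: a plain C sketch of the per-byte saturating ADD that the
 * uqadd8 loops above implement for a8 surfaces.  The helper name is
 * hypothetical and the block is not compiled into pixman. */
#if 0
static void
fbCompositeSrcAdd_a8_ref (uint8_t *dst, const uint8_t *src, uint16_t w)
{
    while (w--)
    {
        unsigned int t = *dst + *src++;          /* widen to avoid overflow */
        *dst++ = t > 0xff ? 0xff : (uint8_t) t;  /* saturate like uqadd8 */
    }
}
#endif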

void
fbCompositeSrc_8888x8888arm (pixman_op_t op,
                             pixman_image_t * pSrc,
                             pixman_image_t * pMask,
                             pixman_image_t * pDst,
                             int16_t      xSrc,
                             int16_t      ySrc,
                             int16_t      xMask,
                             int16_t      yMask,
                             int16_t      xDst,
                             int16_t      yDst,
                             uint16_t     width,
                             uint16_t     height)
{
    uint32_t    *dstLine, *dst;
    uint32_t    *srcLine, *src;
    int dstStride, srcStride;
    uint16_t    w;
    uint32_t component_half = 0x800080;
    uint32_t upper_component_mask = 0xff00ff00;
    uint32_t alpha_mask = 0xff;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;

//#define inner_branch
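        /* For each pixel the assembly below computes the premultiplied OVER
         * operator, dest = src + dest * (255 - src.alpha) / 255, on all four
         * components at once: uxtb16 splits the destination into two
         * half-words per register, mla multiplies each half-word by the
         * inverse alpha and adds the 0x80 rounding bias, and the uxtab16/ror
         * steps fold the high byte of each half-word back in to approximate
         * the division by 255 before uqadd8 adds the source with per-byte
         * saturation. */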
        asm volatile (
                        "cmp %[w], #0\n\t"
                        "beq 2f\n\t"
                        "1:\n\t"
                        /* load src */
                        "ldr r5, [%[src]], #4\n\t"
#ifdef inner_branch
                        /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
                         * The 0x0 case also allows us to avoid doing an unnecessary data
                         * write, which is more valuable, so we only check for that. */
                        "cmp r5, #0\n\t"
                        "beq 3f\n\t"

                        /* = 255 - alpha */
                        "sub r8, %[alpha_mask], r5, lsr #24\n\t"

                        "ldr r4, [%[dest]] \n\t"

#else
                        "ldr r4, [%[dest]] \n\t"

                        /* = 255 - alpha */
                        "sub r8, %[alpha_mask], r5, lsr #24\n\t"
#endif
                        "uxtb16 r6, r4\n\t"
                        "uxtb16 r7, r4, ror #8\n\t"

                        /* multiply by 257 and divide by 65536 */
                        "mla r6, r6, r8, %[component_half]\n\t"
                        "mla r7, r7, r8, %[component_half]\n\t"

                        "uxtab16 r6, r6, r6, ror #8\n\t"
                        "uxtab16 r7, r7, r7, ror #8\n\t"

                        /* recombine the 0xff00ff00 bytes of r6 and r7 */
                        "and r7, r7, %[upper_component_mask]\n\t"
                        "uxtab16 r6, r7, r6, ror #8\n\t"

                        "uqadd8 r5, r6, r5\n\t"

#ifdef inner_branch
                        "3:\n\t"

#endif
                        "str r5, [%[dest]], #4\n\t"
                        /* decrement counter and loop back to the top */
                        "subs   %[w], %[w], #1\n\t"
                        "bne    1b\n\t"
                        "2:\n\t"
                        : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
                        : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
                          [alpha_mask] "r" (alpha_mask)
                        : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
                        );
    }
}
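
/* Illustrative only: a plain C sketch of the per-pixel blend performed by the
 * assembly in fbCompositeSrc_8888x8888arm, using the same 0x80 rounding bias
 * to approximate division by 255.  The helper name is hypothetical and the
 * block is not compiled into pixman. */
#if 0
static uint32_t
fbOver8888_ref (uint32_t src, uint32_t dest)
{
    uint32_t ia = 0xff - (src >> 24);                           /* 255 - src alpha */
    uint32_t rb = (dest & 0x00ff00ff) * ia + 0x00800080;        /* components 0, 2 */
    uint32_t ag = ((dest >> 8) & 0x00ff00ff) * ia + 0x00800080; /* components 1, 3 */

    rb = ((rb + ((rb >> 8) & 0x00ff00ff)) >> 8) & 0x00ff00ff;
    ag = ((ag + ((ag >> 8) & 0x00ff00ff)) >> 8) & 0x00ff00ff;

    /* The assembly uses uqadd8 here; for valid premultiplied pixels the plain
     * add below cannot overflow any byte. */
    return src + (rb | (ag << 8));
}
#endif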

void
fbCompositeSrc_8888x8x8888arm (pixman_op_t op,
                               pixman_image_t * pSrc,
                               pixman_image_t * pMask,
                               pixman_image_t * pDst,
                               int16_t      xSrc,
                               int16_t      ySrc,
                               int16_t      xMask,
                               int16_t      yMask,
                               int16_t      xDst,
                               int16_t      yDst,
                               uint16_t     width,
                               uint16_t     height)
{
    uint32_t    *dstLine, *dst;
    uint32_t    *srcLine, *src;
    uint32_t    mask;
    int dstStride, srcStride;
    uint16_t    w;
    uint32_t component_half = 0x800080;
    uint32_t alpha_mask = 0xff;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

    fbComposeGetSolid (pMask, mask, pDst->bits.format);
    mask = (mask) >> 24;

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;

//#define inner_branch
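        /* The assembly below implements "src IN mask OVER dest": the source
         * components are first scaled by the constant mask alpha, then the
         * result is composited over the destination with the same
         * inverse-alpha multiply and divide-by-255 approximation used in
         * fbCompositeSrc_8888x8888arm above. */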
        asm volatile (
                        "cmp %[w], #0\n\t"
                        "beq 2f\n\t"
                        "1:\n\t"
                        /* load src */
                        "ldr r5, [%[src]], #4\n\t"
#ifdef inner_branch
                        /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
                         * The 0x0 case also allows us to avoid doing an unnecessary data
                         * write, which is more valuable, so we only check for that. */
                        "cmp r5, #0\n\t"
                        "beq 3f\n\t"

#endif
                        "ldr r4, [%[dest]] \n\t"

                        "uxtb16 r6, r5\n\t"
                        "uxtb16 r7, r5, ror #8\n\t"

                        /* multiply by the mask alpha, then by 257 and divide by 65536 */
                        "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
                        "mla r7, r7, %[mask_alpha], %[component_half]\n\t"

                        "uxtab16 r6, r6, r6, ror #8\n\t"
                        "uxtab16 r7, r7, r7, ror #8\n\t"

                        "uxtb16 r6, r6, ror #8\n\t"
                        "uxtb16 r7, r7, ror #8\n\t"

                        /* recombine */
                        "orr r5, r6, r7, lsl #8\n\t"

                        "uxtb16 r6, r4\n\t"
                        "uxtb16 r7, r4, ror #8\n\t"

                        /* 255 - alpha */
                        "sub r8, %[alpha_mask], r5, lsr #24\n\t"

                        /* multiply by the inverse alpha (r8), then by 257 and divide by 65536 */
                        "mla r6, r6, r8, %[component_half]\n\t"
                        "mla r7, r7, r8, %[component_half]\n\t"

                        "uxtab16 r6, r6, r6, ror #8\n\t"
                        "uxtab16 r7, r7, r7, ror #8\n\t"

                        "uxtb16 r6, r6, ror #8\n\t"
                        "uxtb16 r7, r7, ror #8\n\t"

                        /* recombine */
                        "orr r6, r6, r7, lsl #8\n\t"

                        "uqadd8 r5, r6, r5\n\t"

#ifdef inner_branch
                        "3:\n\t"

#endif
                        "str r5, [%[dest]], #4\n\t"
                        /* decrement counter and loop back to the top */
                        "subs   %[w], %[w], #1\n\t"
                        "bne    1b\n\t"
                        "2:\n\t"
                        : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
                        : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
                          [alpha_mask] "r" (alpha_mask)
                        : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory"
                        );
    }
}

void
fbCompositeSolidMask_nx8x8888arm (pixman_op_t      op,
                                  pixman_image_t * pSrc,
                                  pixman_image_t * pMask,
                                  pixman_image_t * pDst,
                                  int16_t      xSrc,
                                  int16_t      ySrc,
                                  int16_t      xMask,
                                  int16_t      yMask,
                                  int16_t      xDst,
                                  int16_t      yDst,
                                  uint16_t     width,
                                  uint16_t     height)
{
    uint32_t     src, srca;
    uint32_t    *dstLine, *dst;
    uint8_t     *maskLine, *mask;
    int          dstStride, maskStride;
    uint16_t     w;

    fbComposeGetSolid(pSrc, src, pDst->bits.format);

    srca = src >> 24;
    if (src == 0)
        return;

    uint32_t component_mask = 0xff00ff;
    uint32_t component_half = 0x800080;

    uint32_t src_hi = (src >> 8) & component_mask;
    uint32_t src_lo = src & component_mask;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        mask = maskLine;
        maskLine += maskStride;
        w = width;

//#define inner_branch
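        /* The solid source was split into src_hi/src_lo outside the loop, so
         * the assembly below only has to scale those halves by the 8-bit mask
         * value of each pixel ("src IN mask") and then composite the result
         * over the destination, reusing the same divide-by-255 approximation
         * as the routines above. */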
        asm volatile (
                        "cmp %[w], #0\n\t"
                        "beq 2f\n\t"
                        "1:\n\t"
                        /* load mask */
                        "ldrb r5, [%[mask]], #1\n\t"
#ifdef inner_branch
                        /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
                         * The 0x0 case also allows us to avoid doing an unnecessary data
                         * write, which is more valuable, so we only check for that. */
                        "cmp r5, #0\n\t"
                        "beq 3f\n\t"

#endif
                        "ldr r4, [%[dest]] \n\t"

                        /* multiply by the mask (r5), then by 257 and divide by 65536 */
                        "mla r6, %[src_lo], r5, %[component_half]\n\t"
                        "mla r7, %[src_hi], r5, %[component_half]\n\t"

                        "uxtab16 r6, r6, r6, ror #8\n\t"
                        "uxtab16 r7, r7, r7, ror #8\n\t"

                        "uxtb16 r6, r6, ror #8\n\t"
                        "uxtb16 r7, r7, ror #8\n\t"

                        /* recombine */
                        "orr r5, r6, r7, lsl #8\n\t"

                        "uxtb16 r6, r4\n\t"
                        "uxtb16 r7, r4, ror #8\n\t"

                        /* we could simplify this to use 'sub' if we were
                         * willing to give up a register for alpha_mask */
                        "mvn r8, r5\n\t"
                        "mov r8, r8, lsr #24\n\t"

                        /* multiply by the inverse alpha (r8), then by 257 and divide by 65536 */
                        "mla r6, r6, r8, %[component_half]\n\t"
                        "mla r7, r7, r8, %[component_half]\n\t"

                        "uxtab16 r6, r6, r6, ror #8\n\t"
                        "uxtab16 r7, r7, r7, ror #8\n\t"

                        "uxtb16 r6, r6, ror #8\n\t"
                        "uxtb16 r7, r7, ror #8\n\t"

                        /* recombine */
                        "orr r6, r6, r7, lsl #8\n\t"

                        "uqadd8 r5, r6, r5\n\t"

#ifdef inner_branch
                        "3:\n\t"

#endif
                        "str r5, [%[dest]], #4\n\t"
                        /* decrement counter and loop back to the top */
                        "subs   %[w], %[w], #1\n\t"
                        "bne    1b\n\t"
                        "2:\n\t"
                        : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask)
                        : [component_half] "r" (component_half),
                          [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
                        : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
                        );
    }
}