2 * Copyright © 2008 Mozilla Corporation
4 * Permission to use, copy, modify, distribute, and sell this software and its
5 * documentation for any purpose is hereby granted without fee, provided that
6 * the above copyright notice appear in all copies and that both that
7 * copyright notice and this permission notice appear in supporting
8 * documentation, and that the name of Mozilla Corporation not be used in
9 * advertising or publicity pertaining to distribution of the software without
10 * specific, written prior permission. Mozilla Corporation makes no
11 * representations about the suitability of this software for any purpose. It
12 * is provided "as is" without express or implied warranty.
14 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
15 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
16 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
17 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
18 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
19 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
20 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
23 * Author: Jeff Muizelaar (jeff@infidigm.net)
30 #include "pixman-arm-simd.h"
/*
 * PIXMAN_OP_ADD fast path: a8 source added into an a8 destination using
 * the ARMv6 UQADD8 instruction (unsigned saturating add of four bytes in
 * parallel).  Unaligned head/tail bytes are handled one at a time; the
 * aligned middle of each scanline is processed one 32-bit word (four
 * pixels) per iteration.
 *
 * NOTE(review): many lines of this function are elided from this view
 * (width/height locals, loop framing, pointer advances); the comments
 * below describe only what is visible here.
 */
33 fbCompositeSrcAdd_8000x8000arm (pixman_op_t op,
34 pixman_image_t * pSrc,
35 pixman_image_t * pMask,
36 pixman_image_t * pDst,
46 uint8_t *dstLine, *dst;
47 uint8_t *srcLine, *src;
48 int dstStride, srcStride;
/* fbComposeGetStart yields the address of the first pixel and the
 * per-scanline stride (in units of the given element type). */
52 fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);
53 fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
63 /* ensure both src and dst are properly aligned before doing 32 bit reads
64 * we'll stay in this loop if src and dst have differing alignments */
65 while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
/* Saturating byte add for the unaligned head.
 * NOTE(review): the template references %2 but only two operands
 * ("+r"(d), "r"(s)) are visible -- confirm the elided lines supply a
 * third operand, or whether this should read "uqadd8 %0, %0, %1". */
69 asm("uqadd8 %0, %1, %2" : "+r"(d) : "r"(s));
/* Aligned middle: four a8 pixels saturating-added per instruction.  The
 * destination word is read and written through a uint32_t cast.
 * NOTE(review): accessing uint8_t storage as uint32_t is technically a
 * strict-aliasing violation -- presumably tolerated by the compilers
 * pixman targets, but worth confirming. */
79 asm("uqadd8 %0, %1, %2" : "=r"(*(uint32_t*)dst) : "r"(*(uint32_t*)src), "r"(*(uint32_t*)dst));
/* Saturating byte add for the unaligned tail (same note as above
 * regarding the %2 operand). */
89 asm("uqadd8 %0, %1, %2" : "+r"(d) : "r"(s));
/*
 * PIXMAN_OP_OVER fast path: a8r8g8b8 source composited onto an 8888
 * destination, one pixel per loop iteration:
 *
 *     dest = src + dest * (255 - src.alpha) / 255
 *
 * The division by 255 is implemented per component as
 * (x * 257 + 0x8000-ish rounding) >> 16 using UXTB16 to split a pixel
 * into two registers of two 16-bit-wide components each (even bytes in
 * one, odd bytes in the other), MLA to multiply-accumulate with the
 * 0x800080 rounding constant, and UXTAB16 to fold in the high bytes.
 *
 * NOTE(review): the loop framing, branch labels, and several asm lines
 * are elided from this view; comments describe the visible lines only.
 */
101 fbCompositeSrc_8888x8888arm (pixman_op_t op,
102 pixman_image_t * pSrc,
103 pixman_image_t * pMask,
104 pixman_image_t * pDst,
114 uint32_t *dstLine, *dst;
115 uint32_t *srcLine, *src;
116 int dstStride, srcStride;
/* 0x00800080: per-component rounding term for the *257>>16 divide. */
118 uint32_t component_half = 0x800080;
119 uint32_t upper_component_mask = 0xff00ff00;
120 uint32_t alpha_mask = 0xff;
122 fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
123 fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
/* Advance to the next scanline (strides are in uint32_t units). */
128 dstLine += dstStride;
130 srcLine += srcStride;
/* Compile-time switch for an early-out branch on src alpha; left
 * disabled here. */
133 //#define inner_branch
/* Load the next source pixel and post-increment the source pointer. */
139 "ldr r5, [%[src]], #4\n\t"
141 /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
142 * The 0x0 case also allows us to avoid doing an unnecessary data
143 * write which is more valuable so we only check for that */
/* r8 = 255 - src.alpha (the blend factor for the destination). */
148 "sub r8, %[alpha_mask], r5, lsr #24\n\t"
150 "ldr r4, [%[dest]] \n\t"
153 "ldr r4, [%[dest]] \n\t"
156 "sub r8, %[alpha_mask], r5, lsr #24\n\t"
/* Split dest (r4) into 16-bit component lanes: r7 gets the odd bytes
 * (the matching even-byte extract is elided from this view). */
159 "uxtb16 r7, r4, ror #8\n\t"
161 /* multiply by 257 and divide by 65536 */
162 "mla r6, r6, r8, %[component_half]\n\t"
163 "mla r7, r7, r8, %[component_half]\n\t"
/* x += x >> 8 per 16-bit lane: completes the approximate /255. */
165 "uxtab16 r6, r6, r6, ror #8\n\t"
166 "uxtab16 r7, r7, r7, ror #8\n\t"
168 /* recombine the 0xff00ff00 bytes of r6 and r7 */
169 "and r7, r7, %[upper_component_mask]\n\t"
170 "uxtab16 r6, r7, r6, ror #8\n\t"
/* dest' = src + scaled dest, saturating per byte. */
172 "uqadd8 r5, r6, r5\n\t"
/* Store the result and post-increment the destination pointer. */
178 "str r5, [%[dest]], #4\n\t"
179 /* increment counter and jmp to top */
180 "subs %[w], %[w], #1\n\t"
183 : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
184 : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
185 [alpha_mask] "r" (alpha_mask)
186 : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
/*
 * PIXMAN_OP_OVER fast path: a8r8g8b8 source with a solid (constant) a8
 * mask onto an 8888 destination.  Per pixel:
 *
 *     s'   = src * mask_alpha / 255
 *     dest = s' + dest * (255 - s'.alpha) / 255
 *
 * Uses the same UXTB16/MLA/UXTAB16 "multiply then *257>>16" technique
 * as fbCompositeSrc_8888x8888arm, applied twice: once to scale the
 * source by the constant mask alpha, once to scale the destination by
 * the inverse of the resulting source alpha.
 *
 * NOTE(review): loop framing, branch labels, and some asm lines are
 * elided from this view; comments describe only the visible lines.
 */
192 fbCompositeSrc_8888x8x8888arm (pixman_op_t op,
193 pixman_image_t * pSrc,
194 pixman_image_t * pMask,
195 pixman_image_t * pDst,
205 uint32_t *dstLine, *dst;
206 uint32_t *srcLine, *src;
208 int dstStride, srcStride;
/* 0x00800080: per-component rounding term for the *257>>16 divide. */
210 uint32_t component_half = 0x800080;
211 uint32_t alpha_mask = 0xff;
213 fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
214 fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
/* Reduce the mask image to a single solid alpha value. */
216 fbComposeGetSolid (pMask, mask, pDst->bits.format);
/* Advance to the next scanline (strides are in uint32_t units). */
222 dstLine += dstStride;
224 srcLine += srcStride;
/* Compile-time switch for an early-out branch on src alpha; disabled. */
227 //#define inner_branch
/* Load the next source pixel, post-incrementing the source pointer. */
233 "ldr r5, [%[src]], #4\n\t"
235 /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
236 * The 0x0 case also allows us to avoid doing an unnecessary data
237 * write which is more valuable so we only check for that */
242 "ldr r4, [%[dest]] \n\t"
/* Split src (r5) into 16-bit lanes; r7 takes the odd bytes. */
245 "uxtb16 r7, r5, ror #8\n\t"
247 /* multiply by alpha (r8) then by 257 and divide by 65536 */
248 "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
249 "mla r7, r7, %[mask_alpha], %[component_half]\n\t"
/* x += x >> 8 per lane, then truncate back to bytes: ~x/255. */
251 "uxtab16 r6, r6, r6, ror #8\n\t"
252 "uxtab16 r7, r7, r7, ror #8\n\t"
254 "uxtb16 r6, r6, ror #8\n\t"
255 "uxtb16 r7, r7, ror #8\n\t"
/* Recombine lanes into the mask-scaled source pixel in r5. */
258 "orr r5, r6, r7, lsl #8\n\t"
/* Split dest (r4) into 16-bit lanes; r7 takes the odd bytes. */
261 "uxtb16 r7, r4, ror #8\n\t"
/* r8 = 255 - scaled-src alpha. */
264 "sub r8, %[alpha_mask], r5, lsr #24\n\t"
266 /* multiply by alpha (r8) then by 257 and divide by 65536 */
267 "mla r6, r6, r8, %[component_half]\n\t"
268 "mla r7, r7, r8, %[component_half]\n\t"
270 "uxtab16 r6, r6, r6, ror #8\n\t"
271 "uxtab16 r7, r7, r7, ror #8\n\t"
273 "uxtb16 r6, r6, ror #8\n\t"
274 "uxtb16 r7, r7, ror #8\n\t"
/* Recombine the scaled destination into r6. */
277 "orr r6, r6, r7, lsl #8\n\t"
/* dest' = scaled src + scaled dest, saturating per byte. */
279 "uqadd8 r5, r6, r5\n\t"
285 "str r5, [%[dest]], #4\n\t"
286 /* increment counter and jmp to top */
287 "subs %[w], %[w], #1\n\t"
290 : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
291 : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
292 [alpha_mask] "r" (alpha_mask)
/* NOTE(review): r9 is declared clobbered but no visible line uses it --
 * presumably used by elided code (or a leftover; confirm). */
293 : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory"
/*
 * PIXMAN_OP_OVER fast path: solid (constant) source with a per-pixel a8
 * mask onto an 8888 destination.  Per pixel:
 *
 *     s'   = solid_src * mask_byte / 255
 *     dest = s' + dest * (255 - s'.alpha) / 255
 *
 * The solid source is pre-split once into src_hi/src_lo (odd/even byte
 * lanes) outside the loop, so only the per-pixel mask multiply and the
 * destination blend run inside the asm loop.
 *
 * NOTE(review): loop framing, branch labels, and some asm lines are
 * elided from this view; comments describe only the visible lines.
 */
299 fbCompositeSolidMask_nx8x8888arm (pixman_op_t op,
300 pixman_image_t * pSrc,
301 pixman_image_t * pMask,
302 pixman_image_t * pDst,
313 uint32_t *dstLine, *dst;
314 uint8_t *maskLine, *mask;
315 int dstStride, maskStride;
/* Reduce the source image to a single solid 8888 pixel. */
318 fbComposeGetSolid(pSrc, src, pDst->bits.format);
324 uint32_t component_mask = 0xff00ff;
/* 0x00800080: per-component rounding term for the *257>>16 divide. */
325 uint32_t component_half = 0x800080;
/* Pre-split the solid source: src_hi = odd bytes (A,G), src_lo = even
 * bytes (R,B), each as two 16-bit lanes. */
327 uint32_t src_hi = (src >> 8) & component_mask;
328 uint32_t src_lo = src & component_mask;
330 fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
331 fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
/* Advance to the next scanline (dst stride in uint32_t units, mask
 * stride in bytes). */
336 dstLine += dstStride;
338 maskLine += maskStride;
/* Compile-time switch for an early-out branch on the mask; disabled. */
341 //#define inner_branch
/* Load the next mask byte, post-incrementing the mask pointer. */
347 "ldrb r5, [%[mask]], #1\n\t"
349 /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
350 * The 0x0 case also allows us to avoid doing an unnecessary data
351 * write which is more valuable so we only check for that */
356 "ldr r4, [%[dest]] \n\t"
358 /* multiply by alpha (r8) then by 257 and divide by 65536 */
359 "mla r6, %[src_lo], r5, %[component_half]\n\t"
360 "mla r7, %[src_hi], r5, %[component_half]\n\t"
/* x += x >> 8 per lane, then truncate back to bytes: ~x/255. */
362 "uxtab16 r6, r6, r6, ror #8\n\t"
363 "uxtab16 r7, r7, r7, ror #8\n\t"
365 "uxtb16 r6, r6, ror #8\n\t"
366 "uxtb16 r7, r7, ror #8\n\t"
/* Recombine lanes into the mask-scaled source pixel in r5. */
369 "orr r5, r6, r7, lsl #8\n\t"
/* Split dest (r4) into 16-bit lanes; r7 takes the odd bytes. */
372 "uxtb16 r7, r4, ror #8\n\t"
374 /* we could simplify this to use 'sub' if we were
375 * willing to give up a register for alpha_mask */
/* r8 >>= 24: isolates the inverse alpha -- presumably r8 was set to
 * ~r5 by an elided instruction; confirm against the full source. */
377 "mov r8, r8, lsr #24\n\t"
379 /* multiply by alpha (r8) then by 257 and divide by 65536 */
380 "mla r6, r6, r8, %[component_half]\n\t"
381 "mla r7, r7, r8, %[component_half]\n\t"
383 "uxtab16 r6, r6, r6, ror #8\n\t"
384 "uxtab16 r7, r7, r7, ror #8\n\t"
386 "uxtb16 r6, r6, ror #8\n\t"
387 "uxtb16 r7, r7, ror #8\n\t"
/* Recombine the scaled destination into r6. */
390 "orr r6, r6, r7, lsl #8\n\t"
/* dest' = scaled src + scaled dest, saturating per byte. */
392 "uqadd8 r5, r6, r5\n\t"
398 "str r5, [%[dest]], #4\n\t"
399 /* increment counter and jmp to top */
400 "subs %[w], %[w], #1\n\t"
403 : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask)
404 : [component_half] "r" (component_half),
405 [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
406 : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
/*
 * Fast-path dispatch table.  Each row is:
 *   { operator, source format, mask format, dest format, handler, flags }
 * pixman's compositing core scans this table and calls the handler when
 * an operation matches a row; PIXMAN_null means "no mask" and
 * NEED_SOLID_MASK restricts the row to masks reducible to a single value.
 */
411 static const FastPathInfo arm_simd_fast_path_array[] =
413 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8888arm, 0 },
414 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8888arm, 0 },
415 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, fbCompositeSrc_8888x8888arm, 0 },
416 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fbCompositeSrc_8888x8888arm, 0 },
417 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8x8888arm, NEED_SOLID_MASK },
418 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8x8888arm, NEED_SOLID_MASK },
420 { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, fbCompositeSrcAdd_8000x8000arm, 0 },
422 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSolidMask_nx8x8888arm, 0 },
423 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSolidMask_nx8x8888arm, 0 },
424 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, fbCompositeSolidMask_nx8x8888arm, 0 },
425 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeSolidMask_nx8x8888arm, 0 },
/* Public handle to the table above; presumably consumed by pixman's
 * implementation-selection code when ARMv6 SIMD support is detected. */
430 const FastPathInfo *const arm_simd_fast_paths = arm_simd_fast_path_array;