/*
 * Copyright © 2008 Mozilla Corporation
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Mozilla Corporation not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  Mozilla Corporation makes no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Jeff Muizelaar (jeff@infidigm.net)
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include "pixman-private.h"
static void
arm_CompositeAdd_8000_8000 (pixman_implementation_t * impl,
                            pixman_op_t               op,
                            pixman_image_t *          pSrc,
                            pixman_image_t *          pMask,
                            pixman_image_t *          pDst,
                            int16_t                   xSrc,
                            int16_t                   ySrc,
                            int16_t                   xMask,
                            int16_t                   yMask,
                            int16_t                   xDst,
                            int16_t                   yDst,
                            uint16_t                  width,
                            uint16_t                  height)
{
    uint8_t  *dstLine, *dst;
    uint8_t  *srcLine, *src;
    int       dstStride, srcStride;
    uint16_t  w;
    uint8_t   s, d;
    fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);
    fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;

        /* ensure both src and dst are properly aligned before doing 32 bit
         * reads; we'll stay in this loop if src and dst have differing
         * alignments */
        while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
        {
            s = *src;
            d = *dst;
            asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
            *dst = d;

            dst++;
            src++;
            w--;
        }

        /* four pixels at a time, one saturating byte-wise add per word */
        while (w >= 4)
        {
            asm ("uqadd8 %0, %1, %2"
                 : "=r" (*(uint32_t*)dst)
                 : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst));
            dst += 4;
            src += 4;
            w -= 4;
        }

        /* trailing pixels, one byte at a time */
        while (w)
        {
            s = *src;
            d = *dst;
            asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
            *dst = d;

            dst++;
            src++;
            w--;
        }
    }
}
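/*
 * For reference: uqadd8 adds the four byte lanes of its two operands with
 * unsigned saturation, so each lane clamps at 0xff instead of wrapping.
 * A minimal portable C sketch of the same operation (the helper name is
 * ours, for illustration only; it is not part of pixman):
 */
#if 0
static inline uint32_t
uqadd8_sketch (uint32_t a, uint32_t b)
{
    uint32_t result = 0;
    int      i;

    for (i = 0; i < 4; i++)
    {
        uint32_t sum = ((a >> (i * 8)) & 0xff) + ((b >> (i * 8)) & 0xff);

        /* saturate each byte lane at 0xff */
        result |= (sum > 0xff ? 0xff : sum) << (i * 8);
    }
    return result;
}
#endif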
static void
arm_composite_over_8888_8888 (pixman_implementation_t * impl,
                              pixman_op_t               op,
                              pixman_image_t *          pSrc,
                              pixman_image_t *          pMask,
                              pixman_image_t *          pDst,
                              int16_t                   xSrc,
                              int16_t                   ySrc,
                              int16_t                   xMask,
                              int16_t                   yMask,
                              int16_t                   xDst,
                              int16_t                   yDst,
                              uint16_t                  width,
                              uint16_t                  height)
{
    uint32_t *dstLine, *dst;
    uint32_t *srcLine, *src;
    int       dstStride, srcStride;
    uint16_t  w;

    uint32_t component_half = 0x800080;
    uint32_t upper_component_mask = 0xff00ff00;
    uint32_t alpha_mask = 0xff;
    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;
//#define inner_branch

        asm volatile (
            "cmp %[w], #0\n\t"
            "beq 2f\n\t"
            "1:\n\t"
            /* load src */
            "ldr r5, [%[src]], #4\n\t"
#ifdef inner_branch
            /* We can avoid doing the multiplication in two cases: 0x0 or
             * 0xff.  The 0x0 case also allows us to avoid doing an
             * unnecessary data write, which is more valuable, so we only
             * check for that. */
            "cmp r5, #0\n\t"
            "beq 3f\n\t"

            /* = 255 - alpha */
            "sub r8, %[alpha_mask], r5, lsr #24\n\t"

            "ldr r4, [%[dest]] \n\t"
#else
            "ldr r4, [%[dest]] \n\t"

            /* = 255 - alpha */
            "sub r8, %[alpha_mask], r5, lsr #24\n\t"
#endif
            /* split dest into even (r6) and odd (r7) byte lanes */
            "uxtb16 r6, r4\n\t"
            "uxtb16 r7, r4, ror #8\n\t"

            /* multiply each component by (255 - alpha) in r8, then by 257,
             * and divide by 65536 */
            "mla r6, r6, r8, %[component_half]\n\t"
            "mla r7, r7, r8, %[component_half]\n\t"

            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"

            /* recombine the 0xff00ff00 bytes of r6 and r7 */
            "and r7, r7, %[upper_component_mask]\n\t"
            "uxtab16 r6, r7, r6, ror #8\n\t"

            "uqadd8 r5, r6, r5\n\t"
#ifdef inner_branch
            "3:\n\t"
#endif
            "str r5, [%[dest]], #4\n\t"
            /* decrement the loop counter and branch back to the top */
            "subs %[w], %[w], #1\n\t"
            "bne 1b\n\t"
            "2:\n\t"
            : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
            : [component_half] "r" (component_half),
              [upper_component_mask] "r" (upper_component_mask),
              [alpha_mask] "r" (alpha_mask)
            : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
            );
    }
}
static void
arm_composite_over_8888_n_8888 (pixman_implementation_t * impl,
                                pixman_op_t               op,
                                pixman_image_t *          pSrc,
                                pixman_image_t *          pMask,
                                pixman_image_t *          pDst,
                                int16_t                   xSrc,
                                int16_t                   ySrc,
                                int16_t                   xMask,
                                int16_t                   yMask,
                                int16_t                   xDst,
                                int16_t                   yDst,
                                uint16_t                  width,
                                uint16_t                  height)
{
    uint32_t *dstLine, *dst;
    uint32_t *srcLine, *src;
    uint32_t  mask;
    int       dstStride, srcStride;
    uint16_t  w;

    uint32_t component_half = 0x800080;
    uint32_t alpha_mask = 0xff;
    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

    mask = _pixman_image_get_solid (pMask, pDst->bits.format);
    mask = mask >> 24;

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;
//#define inner_branch

        asm volatile (
            "cmp %[w], #0\n\t"
            "beq 2f\n\t"
            "1:\n\t"
            /* load src */
            "ldr r5, [%[src]], #4\n\t"
#ifdef inner_branch
            /* We can avoid doing the multiplication in two cases: 0x0 or
             * 0xff.  The 0x0 case also allows us to avoid doing an
             * unnecessary data write, which is more valuable, so we only
             * check for that. */
            "cmp r5, #0\n\t"
            "beq 3f\n\t"
#endif
            "ldr r4, [%[dest]] \n\t"

            /* split src into even (r6) and odd (r7) byte lanes */
            "uxtb16 r6, r5\n\t"
            "uxtb16 r7, r5, ror #8\n\t"

            /* multiply by the mask alpha, then by 257, and divide by 65536 */
            "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
            "mla r7, r7, %[mask_alpha], %[component_half]\n\t"

            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"

            "uxtb16 r6, r6, ror #8\n\t"
            "uxtb16 r7, r7, ror #8\n\t"

            /* recombine */
            "orr r5, r6, r7, lsl #8\n\t"

            "uxtb16 r6, r4\n\t"
            "uxtb16 r7, r4, ror #8\n\t"

            /* = 255 - alpha */
            "sub r8, %[alpha_mask], r5, lsr #24\n\t"

            /* multiply by (255 - alpha) in r8, then by 257, and divide by
             * 65536 */
            "mla r6, r6, r8, %[component_half]\n\t"
            "mla r7, r7, r8, %[component_half]\n\t"

            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"

            "uxtb16 r6, r6, ror #8\n\t"
            "uxtb16 r7, r7, ror #8\n\t"

            /* recombine */
            "orr r6, r6, r7, lsl #8\n\t"

            "uqadd8 r5, r6, r5\n\t"
#ifdef inner_branch
            "3:\n\t"
#endif
            "str r5, [%[dest]], #4\n\t"
            /* decrement the loop counter and branch back to the top */
            "subs %[w], %[w], #1\n\t"
            "bne 1b\n\t"
            "2:\n\t"
            : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
            : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
              [alpha_mask] "r" (alpha_mask)
            : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory"
            );
    }
}
static void
arm_CompositeOver_n_8_8888 (pixman_implementation_t * impl,
                            pixman_op_t               op,
                            pixman_image_t *          pSrc,
                            pixman_image_t *          pMask,
                            pixman_image_t *          pDst,
                            int16_t                   xSrc,
                            int16_t                   ySrc,
                            int16_t                   xMask,
                            int16_t                   yMask,
                            int16_t                   xDst,
                            int16_t                   yDst,
                            uint16_t                  width,
                            uint16_t                  height)
{
    uint32_t  src;
    uint32_t *dstLine, *dst;
    uint8_t  *maskLine, *mask;
    int       dstStride, maskStride;
    uint16_t  w;
    src = _pixman_image_get_solid (pSrc, pDst->bits.format);

    // bail out if fully transparent
    if (src == 0)
        return;

    uint32_t component_mask = 0xff00ff;
    uint32_t component_half = 0x800080;

    uint32_t src_hi = (src >> 8) & component_mask;
    uint32_t src_lo = src & component_mask;
    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        mask = maskLine;
        maskLine += maskStride;
        w = width;
//#define inner_branch

        asm volatile (
            "cmp %[w], #0\n\t"
            "beq 2f\n\t"
            "1:\n\t"
            /* load mask */
            "ldrb r5, [%[mask]], #1\n\t"
#ifdef inner_branch
            /* We can avoid doing the multiplication in two cases: 0x0 or
             * 0xff.  The 0x0 case also allows us to avoid doing an
             * unnecessary data write, which is more valuable, so we only
             * check for that. */
            "cmp r5, #0\n\t"
            "beq 3f\n\t"
#endif
            "ldr r4, [%[dest]] \n\t"

            /* multiply the solid src by the mask alpha in r5, then by 257,
             * and divide by 65536 */
            "mla r6, %[src_lo], r5, %[component_half]\n\t"
            "mla r7, %[src_hi], r5, %[component_half]\n\t"

            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"

            "uxtb16 r6, r6, ror #8\n\t"
            "uxtb16 r7, r7, ror #8\n\t"

            /* recombine */
            "orr r5, r6, r7, lsl #8\n\t"

            "uxtb16 r6, r4\n\t"
            "uxtb16 r7, r4, ror #8\n\t"

            /* we could simplify this to use 'sub' if we were
             * willing to give up a register for alpha_mask */
            "mvn r8, r5\n\t"
            "mov r8, r8, lsr #24\n\t"

            /* multiply by (255 - alpha) in r8, then by 257, and divide by
             * 65536 */
            "mla r6, r6, r8, %[component_half]\n\t"
            "mla r7, r7, r8, %[component_half]\n\t"

            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"

            "uxtb16 r6, r6, ror #8\n\t"
            "uxtb16 r7, r7, ror #8\n\t"

            /* recombine */
            "orr r6, r6, r7, lsl #8\n\t"

            "uqadd8 r5, r6, r5\n\t"
#ifdef inner_branch
            "3:\n\t"
#endif
            "str r5, [%[dest]], #4\n\t"
            /* decrement the loop counter and branch back to the top */
            "subs %[w], %[w], #1\n\t"
            "bne 1b\n\t"
            "2:\n\t"
            : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask)
            : [component_half] "r" (component_half),
              [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
            : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
            );
    }
}
static const pixman_fast_path_t arm_simd_fast_path_array[] =
{
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, arm_composite_over_8888_8888,   0 },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, arm_composite_over_8888_8888,   0 },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, arm_composite_over_8888_8888,   0 },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, arm_composite_over_8888_8888,   0 },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,   PIXMAN_a8r8g8b8, arm_composite_over_8888_n_8888, NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,   PIXMAN_x8r8g8b8, arm_composite_over_8888_n_8888, NEED_SOLID_MASK },

    { PIXMAN_OP_ADD,  PIXMAN_a8,       PIXMAN_null, PIXMAN_a8,       arm_CompositeAdd_8000_8000,     0 },

    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,   PIXMAN_a8r8g8b8, arm_CompositeOver_n_8_8888,     0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,   PIXMAN_x8r8g8b8, arm_CompositeOver_n_8_8888,     0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,   PIXMAN_a8b8g8r8, arm_CompositeOver_n_8_8888,     0 },
    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,   PIXMAN_x8b8g8r8, arm_CompositeOver_n_8_8888,     0 },

    { PIXMAN_OP_NONE },
};

const pixman_fast_path_t *const arm_simd_fast_paths = arm_simd_fast_path_array;
static void
arm_simd_composite (pixman_implementation_t *imp,
                    pixman_op_t               op,
                    pixman_image_t           *src,
                    pixman_image_t           *mask,
                    pixman_image_t           *dest,
                    int32_t                   src_x,
                    int32_t                   src_y,
                    int32_t                   mask_x,
                    int32_t                   mask_y,
                    int32_t                   dest_x,
                    int32_t                   dest_y,
                    int32_t                   width,
                    int32_t                   height)
{
    if (_pixman_run_fast_path (arm_simd_fast_paths, imp,
                               op, src, mask, dest,
                               src_x, src_y,
                               mask_x, mask_y,
                               dest_x, dest_y,
                               width, height))
    {
        return;
    }

    _pixman_implementation_composite (imp->delegate, op,
                                      src, mask, dest,
                                      src_x, src_y,
                                      mask_x, mask_y,
                                      dest_x, dest_y,
                                      width, height);
}
pixman_implementation_t *
_pixman_implementation_create_arm_simd (void)
{
    pixman_implementation_t *general = _pixman_implementation_create_fast_path ();
    pixman_implementation_t *imp = _pixman_implementation_create (general);

    imp->composite = arm_simd_composite;

    return imp;
}
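/*
 * How this fits together: the returned implementation wraps the generic
 * fast-path implementation, so a composite request first tries the ARM
 * SIMD entries in arm_simd_fast_paths and, if none matches, falls through
 * to imp->delegate in arm_simd_composite() above.
 */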