2 * Copyright © 2008 Mozilla Corporation
4 * Permission to use, copy, modify, distribute, and sell this software and its
5 * documentation for any purpose is hereby granted without fee, provided that
6 * the above copyright notice appear in all copies and that both that
7 * copyright notice and this permission notice appear in supporting
8 * documentation, and that the name of Mozilla Corporation not be used in
9 * advertising or publicity pertaining to distribution of the software without
10 * specific, written prior permission. Mozilla Corporation makes no
11 * representations about the suitability of this software for any purpose. It
12 * is provided "as is" without express or implied warranty.
14 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
15 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
16 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
17 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
18 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
19 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
20 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
23 * Author: Jeff Muizelaar (jeff@infidigm.net)
30 #include "pixman-arm-simd.h"
/* NOTE(review): this file appears to be a fragmentary, line-numbered dump of
 * pixman-arm-simd.c — each line carries its original line number and many
 * intermediate lines (parameter list, loop bodies, closing braces) are
 * missing.  Restore the complete upstream file before attempting to build. */
/* ADD compositing fast path: saturating add of an a8 (8-bit alpha) source
 * onto an a8 destination, using the ARMv6 UQADD8 instruction, which performs
 * four parallel unsigned saturating 8-bit adds in one 32-bit register. */
33 fbCompositeSrcAdd_8000x8000arm (
34     pixman_implementation_t * impl,
36     pixman_image_t * pSrc,
37     pixman_image_t * pMask,
38     pixman_image_t * pDst,
/* Per-scanline cursors and per-image strides (in uint8_t units, as set up
 * by fbComposeGetStart below). */
48     uint8_t *dstLine, *dst;
49     uint8_t *srcLine, *src;
50     int dstStride, srcStride;
54     fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);
55     fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
65     /* ensure both src and dst are properly aligned before doing 32 bit reads
66      * we'll stay in this loop if src and dst have differing alignments */
67     while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
/* Head loop: one byte at a time until both pointers are 4-byte aligned.
 * NOTE(review): `d` and `s` are byte temporaries declared on lines missing
 * from this dump — confirm against the upstream source. */
71 	asm("uqadd8 %0, %1, %2" : "+r"(d) : "r"(s));
/* Main loop: saturating add of four pixels per iteration via a single
 * aligned 32-bit load/store pair on each of src and dst. */
81 	asm("uqadd8 %0, %1, %2" : "=r"(*(uint32_t*)dst) : "r"(*(uint32_t*)src), "r"(*(uint32_t*)dst));
/* Tail loop: handle the remaining (w & 3) pixels one byte at a time. */
91 	asm("uqadd8 %0, %1, %2" : "+r"(d) : "r"(s));
/* OVER compositing fast path: a8r8g8b8 source over an 8888 destination
 * (no mask), one pixel per inline-asm iteration.  Uses the ARMv6 media
 * instructions UXTB16/UXTAB16 to split a pixel into two registers of
 * 16-bit-spaced components so that MLA can multiply two components at once.
 * NOTE(review): fragmentary dump — the width/height loop scaffolding and
 * several asm lines (labels, branches) are missing from this view. */
103 fbCompositeSrc_8888x8888arm (
104     pixman_implementation_t * impl,
106     pixman_image_t * pSrc,
107     pixman_image_t * pMask,
108     pixman_image_t * pDst,
118     uint32_t *dstLine, *dst;
119     uint32_t *srcLine, *src;
120     int dstStride, srcStride;
/* Constants fed to the asm block:
 *  - component_half (0x00800080): rounding bias added to each 16-bit lane
 *    before the "multiply by 257 / divide by 65536" divide-by-255 trick.
 *  - upper_component_mask / alpha_mask: lane-combining and alpha-extraction
 *    masks used by the asm below. */
122     uint32_t component_half = 0x800080;
123     uint32_t upper_component_mask = 0xff00ff00;
124     uint32_t alpha_mask = 0xff;
126     fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
127     fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
/* Advance to the next scanline (strides are in uint32_t units here). */
132 	dstLine += dstStride;
134 	srcLine += srcStride;
/* Optional early-out branch on src alpha; disabled in this build. */
137 //#define inner_branch
/* r5 = next source pixel, post-incrementing the src pointer. */
143 	    "ldr r5, [%[src]], #4\n\t"
145 	    /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
146 	     * The 0x0 case also allows us to avoid doing an unecessary data
147 	     * write which is more valuable so we only check for that */
/* r8 = 255 - src_alpha (the inverse alpha the destination is scaled by). */
152 	    "sub r8, %[alpha_mask], r5, lsr #24\n\t"
154 	    "ldr r4, [%[dest]] \n\t"
157 	    "ldr r4, [%[dest]] \n\t"
160 	    "sub r8, %[alpha_mask], r5, lsr #24\n\t"
/* Split dest pixel r4 into r6 (even components) / r7 (odd components),
 * each widened to 16 bits per lane.  NOTE(review): the matching
 * "uxtb16 r6, r4" line is missing from this dump. */
163 	    "uxtb16 r7, r4, ror #8\n\t"
165 	    /* multiply by 257 and divide by 65536 */
166 	    "mla r6, r6, r8, %[component_half]\n\t"
167 	    "mla r7, r7, r8, %[component_half]\n\t"
/* x + (x >> 8) per 16-bit lane: completes the divide-by-255 approximation. */
169 	    "uxtab16 r6, r6, r6, ror #8\n\t"
170 	    "uxtab16 r7, r7, r7, ror #8\n\t"
172 	    /* recombine the 0xff00ff00 bytes of r6 and r7 */
173 	    "and r7, r7, %[upper_component_mask]\n\t"
174 	    "uxtab16 r6, r7, r6, ror #8\n\t"
/* OVER = src + dest*(1-alpha); uqadd8 does the per-byte saturating add. */
176 	    "uqadd8 r5, r6, r5\n\t"
182 	    "str r5, [%[dest]], #4\n\t"
183 	    /* increment counter and jmp to top */
184 	    "subs %[w], %[w], #1\n\t"
187 	    : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
188 	    : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
189 	      [alpha_mask] "r" (alpha_mask)
190 	    : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
/* OVER compositing fast path: a8r8g8b8 source, modulated by a solid a8
 * mask, over an 8888 destination (registered with NEED_SOLID_MASK below).
 * Same UXTB16/MLA component-pair technique as fbCompositeSrc_8888x8888arm,
 * but the source is first scaled by the constant mask alpha.
 * NOTE(review): fragmentary dump — `mask` and the loop scaffolding are
 * declared on lines missing from this view. */
196 fbCompositeSrc_8888x8x8888arm (
197     pixman_implementation_t * impl,
199     pixman_image_t * pSrc,
200     pixman_image_t * pMask,
201     pixman_image_t * pDst,
211     uint32_t *dstLine, *dst;
212     uint32_t *srcLine, *src;
214     int dstStride, srcStride;
/* Rounding bias and alpha-extraction constant for the asm block. */
216     uint32_t component_half = 0x800080;
217     uint32_t alpha_mask = 0xff;
219     fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
220     fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
/* Reduce the mask image to a single solid alpha value in `mask`. */
222     _pixman_image_get_solid (pMask, mask, pDst->bits.format);
228 	dstLine += dstStride;
230 	srcLine += srcStride;
/* Optional early-out branch on src alpha; disabled in this build. */
233 //#define inner_branch
/* r5 = next source pixel, post-incrementing the src pointer. */
239 	    "ldr r5, [%[src]], #4\n\t"
241 	    /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
242 	     * The 0x0 case also allows us to avoid doing an unecessary data
243 	     * write which is more valuable so we only check for that */
248 	    "ldr r4, [%[dest]] \n\t"
/* Split src pixel r5 into 16-bit-per-lane halves (r6/r7), then scale each
 * component by the solid mask alpha with the multiply-and-round trick. */
251 	    "uxtb16 r7, r5, ror #8\n\t"
253 	    /* multiply by alpha (r8) then by 257 and divide by 65536 */
254 	    "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
255 	    "mla r7, r7, %[mask_alpha], %[component_half]\n\t"
257 	    "uxtab16 r6, r6, r6, ror #8\n\t"
258 	    "uxtab16 r7, r7, r7, ror #8\n\t"
/* Narrow each 16-bit lane back down to its high byte. */
260 	    "uxtb16 r6, r6, ror #8\n\t"
261 	    "uxtb16 r7, r7, ror #8\n\t"
/* Recombine lanes into r5 = mask-scaled source pixel. */
264 	    "orr r5, r6, r7, lsl #8\n\t"
/* Now scale the destination pixel (r4) by the inverse of the scaled
 * source alpha: r8 = 255 - (r5 >> 24). */
267 	    "uxtb16 r7, r4, ror #8\n\t"
270 	    "sub r8, %[alpha_mask], r5, lsr #24\n\t"
272 	    /* multiply by alpha (r8) then by 257 and divide by 65536 */
273 	    "mla r6, r6, r8, %[component_half]\n\t"
274 	    "mla r7, r7, r8, %[component_half]\n\t"
276 	    "uxtab16 r6, r6, r6, ror #8\n\t"
277 	    "uxtab16 r7, r7, r7, ror #8\n\t"
279 	    "uxtb16 r6, r6, ror #8\n\t"
280 	    "uxtb16 r7, r7, ror #8\n\t"
283 	    "orr r6, r6, r7, lsl #8\n\t"
/* OVER: saturating per-byte add of scaled src and scaled dest. */
285 	    "uqadd8 r5, r6, r5\n\t"
291 	    "str r5, [%[dest]], #4\n\t"
292 	    /* increment counter and jmp to top */
293 	    "subs %[w], %[w], #1\n\t"
296 	    : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
297 	    : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
298 	      [alpha_mask] "r" (alpha_mask)
299 	    : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory"
/* OVER compositing fast path: solid source color, modulated per-pixel by an
 * a8 mask, over an 8888 destination.  The solid source is pre-split into
 * src_hi/src_lo component pairs outside the loop so the inner asm only has
 * to scale by the per-pixel mask byte and blend.
 * NOTE(review): fragmentary dump — `src`, `w`, and the loop scaffolding are
 * declared on lines missing from this view. */
305 fbCompositeSolidMask_nx8x8888arm (
306     pixman_implementation_t * impl,
308     pixman_image_t * pSrc,
309     pixman_image_t * pMask,
310     pixman_image_t * pDst,
321     uint32_t *dstLine, *dst;
322     uint8_t *maskLine, *mask;
323     int dstStride, maskStride;
/* Reduce the source image to a single solid pixel value in `src`. */
326     _pixman_image_get_solid(pSrc, src, pDst->bits.format);
332     uint32_t component_mask = 0xff00ff;
333     uint32_t component_half = 0x800080;
/* Pre-split the solid source into two registers of 16-bit-spaced
 * components: src_lo holds bytes 0 and 2, src_hi holds bytes 1 and 3. */
335     uint32_t src_hi = (src >> 8) & component_mask;
336     uint32_t src_lo = src & component_mask;
338     fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
339     fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
344 	dstLine += dstStride;
346 	maskLine += maskStride;
/* Optional early-out branch on mask value; disabled in this build. */
349 //#define inner_branch
/* r5 = next mask byte, post-incrementing the mask pointer. */
355 	    "ldrb r5, [%[mask]], #1\n\t"
357 	    /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
358 	     * The 0x0 case also allows us to avoid doing an unecessary data
359 	     * write which is more valuable so we only check for that */
364 	    "ldr r4, [%[dest]] \n\t"
366 	    /* multiply by alpha (r8) then by 257 and divide by 65536 */
367 	    "mla r6, %[src_lo], r5, %[component_half]\n\t"
368 	    "mla r7, %[src_hi], r5, %[component_half]\n\t"
370 	    "uxtab16 r6, r6, r6, ror #8\n\t"
371 	    "uxtab16 r7, r7, r7, ror #8\n\t"
373 	    "uxtb16 r6, r6, ror #8\n\t"
374 	    "uxtb16 r7, r7, ror #8\n\t"
/* r5 = mask-scaled solid source pixel, reassembled from the two halves. */
377 	    "orr r5, r6, r7, lsl #8\n\t"
/* Split dest pixel r4 and compute the inverse alpha of the scaled source. */
380 	    "uxtb16 r7, r4, ror #8\n\t"
382 	    /* we could simplify this to use 'sub' if we were
383 	     * willing to give up a register for alpha_mask */
/* r8 = scaled-source alpha (top byte); the preceding (missing) instruction
 * presumably negates it — confirm against the upstream source. */
385 	    "mov r8, r8, lsr #24\n\t"
387 	    /* multiply by alpha (r8) then by 257 and divide by 65536 */
388 	    "mla r6, r6, r8, %[component_half]\n\t"
389 	    "mla r7, r7, r8, %[component_half]\n\t"
391 	    "uxtab16 r6, r6, r6, ror #8\n\t"
392 	    "uxtab16 r7, r7, r7, ror #8\n\t"
394 	    "uxtb16 r6, r6, ror #8\n\t"
395 	    "uxtb16 r7, r7, ror #8\n\t"
398 	    "orr r6, r6, r7, lsl #8\n\t"
/* OVER: saturating per-byte add of scaled src and scaled dest. */
400 	    "uqadd8 r5, r6, r5\n\t"
406 	    "str r5, [%[dest]], #4\n\t"
407 	    /* increment counter and jmp to top */
408 	    "subs %[w], %[w], #1\n\t"
411 	    : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask)
412 	    : [component_half] "r" (component_half),
413 	      [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
414 	    : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
/* Fast-path dispatch table: maps (operator, src format, mask format, dest
 * format) tuples to the ARM SIMD routines above.  NEED_SOLID_MASK restricts
 * the 8888x8x8888 entries to masks that reduce to a single solid value.
 * NOTE(review): fragmentary dump — the PIXMAN_OP_ADD entry for the
 * byte-reversed formats and the table terminator are missing here. */
419 static const pixman_fast_path_t arm_simd_fast_path_array[] =
421     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8888arm, 0 },
422     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8888arm, 0 },
423     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, fbCompositeSrc_8888x8888arm, 0 },
424     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fbCompositeSrc_8888x8888arm, 0 },
425     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8x8888arm, NEED_SOLID_MASK },
426     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8x8888arm, NEED_SOLID_MASK },
428     { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, fbCompositeSrcAdd_8000x8000arm, 0 },
430     { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSolidMask_nx8x8888arm, 0 },
431     { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSolidMask_nx8x8888arm, 0 },
432     { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, fbCompositeSolidMask_nx8x8888arm, 0 },
433     { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeSolidMask_nx8x8888arm, 0 },
/* Exported alias consumed by arm_simd_composite() below. */
438 const pixman_fast_path_t *const arm_simd_fast_paths = arm_simd_fast_path_array;
/* Composite entry point for the ARM SIMD implementation: try the fast-path
 * table first, and fall back to the delegate implementation when no entry
 * matches.  NOTE(review): fragmentary dump — most parameters and the body's
 * control flow are on lines missing from this view. */
441 arm_simd_composite (pixman_implementation_t *imp,
444     pixman_image_t *mask,
445     pixman_image_t *dest,
/* Fast-path attempt; on failure, delegate below. */
455     if (_pixman_run_fast_path (arm_simd_fast_paths, imp,
465     _pixman_implementation_composite (imp->delegate, op,
/* Construct the ARM SIMD pixman implementation, chained on top of the
 * generic fast-path implementation as its delegate, and install the
 * composite hook.  NOTE(review): fragmentary dump — the return statement
 * and closing brace are missing from this view. */
473 pixman_implementation_t *
474 _pixman_implementation_create_arm_simd (void)
476     pixman_implementation_t *general = _pixman_implementation_create_fast_path ();
477     pixman_implementation_t *imp = _pixman_implementation_create (general);
479     imp->composite = arm_simd_composite;