2 * Copyright © 2008 Mozilla Corporation
4 * Permission to use, copy, modify, distribute, and sell this software and its
5 * documentation for any purpose is hereby granted without fee, provided that
6 * the above copyright notice appear in all copies and that both that
7 * copyright notice and this permission notice appear in supporting
8 * documentation, and that the name of Mozilla Corporation not be used in
9 * advertising or publicity pertaining to distribution of the software without
10 * specific, written prior permission. Mozilla Corporation makes no
11 * representations about the suitability of this software for any purpose. It
12 * is provided "as is" without express or implied warranty.
14 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
15 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
16 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
17 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
18 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
19 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
20 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
23 * Author: Jeff Muizelaar (jeff@infidigm.net)
30 #include "pixman-private.h"
/*
 * ADD operator for 8-bit (a8 -> a8) surfaces using the ARMv6 SIMD
 * instruction UQADD8 (unsigned saturating add of four packed bytes).
 * NOTE(review): this excerpt elides several lines of the function
 * (loop headers, width/height setup); comments describe only the
 * lines visible here.
 */
33 arm_composite_add_8000_8000 (pixman_implementation_t * impl,
35 pixman_image_t * src_image,
36 pixman_image_t * mask_image,
37 pixman_image_t * dst_image,
/* Per-scanline walk pointers; strides are in uint8_t units here. */
47 uint8_t *dst_line, *dst;
48 uint8_t *src_line, *src;
49 int dst_stride, src_stride;
/* Resolve first-pixel addresses and strides for both images. */
53 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
54 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
/* Advance to the next scanline (presumably inside the outer height loop
 * elided from this view — TODO confirm against the full file). */
59 dst_line += dst_stride;
61 src_line += src_stride;
64 /* ensure both src and dst are properly aligned before doing 32 bit reads
65 * we'll stay in this loop if src and dst have differing alignments
67 while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
/* Single-byte saturating add: d = sat(d + s).  %2 works because GCC
 * numbers the tied input copy of the "+r" operand after the explicit
 * inputs, so %0 = d (out), %1 = s, %2 = d's incoming value. */
71 asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
/* Aligned fast path: saturating add of 4 pixels (one 32-bit word) at
 * a time, reading/writing through uint32_t casts of the byte pointers. */
81 asm ("uqadd8 %0, %1, %2"
82 : "=r" (*(uint32_t*)dst)
83 : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst));
/* Tail loop (elided here): same single-byte saturating add as above. */
93 asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
/*
 * OVER operator for 32-bit (a8r8g8b8 -> a/x8r8g8b8) surfaces, one pixel
 * per iteration of an inline-asm loop using ARMv6 SIMD instructions
 * (UXTB16/UXTAB16/MLA/UQADD8).  Computes dest = src + dest*(255 - src.a)
 * with per-byte saturation.
 * NOTE(review): this excerpt elides several lines (loop labels, branch
 * instructions, width/height setup); comments describe only visible lines.
 */
105 arm_composite_over_8888_8888 (pixman_implementation_t * impl,
107 pixman_image_t * src_image,
108 pixman_image_t * mask_image,
109 pixman_image_t * dst_image,
119 uint32_t *dst_line, *dst;
120 uint32_t *src_line, *src;
121 int dst_stride, src_stride;
/* Constants for the byte-wise multiply: 0x800080 is the rounding bias
 * used with the "multiply by 257, divide by 65536" trick below;
 * 0xff00ff00 selects the high byte of each 16-bit component pair. */
123 uint32_t component_half = 0x800080;
124 uint32_t upper_component_mask = 0xff00ff00;
125 uint32_t alpha_mask = 0xff;
127 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
128 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
/* Advance to next scanline (outer loop elided from this view). */
133 dst_line += dst_stride;
135 src_line += src_stride;
138 /* #define inner_branch */
/* r5 = source pixel, post-incrementing the src pointer. */
144 "ldr r5, [%[src]], #4\n\t"
146 /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
147 * The 0x0 case also allows us to avoid doing an unecessary data
148 * write which is more valuable so we only check for that
/* r8 = 255 - src alpha (inverse alpha); r4 = destination pixel.
 * Both the inner_branch and the straight-line variants appear here —
 * only one is compiled depending on the (commented-out) #define. */
154 "sub r8, %[alpha_mask], r5, lsr #24\n\t"
156 "ldr r4, [%[dest]] \n\t"
159 "ldr r4, [%[dest]] \n\t"
162 "sub r8, %[alpha_mask], r5, lsr #24\n\t"
/* Split dest into even bytes (r6, via elided uxtb16) and odd bytes (r7)
 * as two 0x00XX00XX halfword pairs. */
165 "uxtb16 r7, r4, ror #8\n\t"
167 /* multiply by 257 and divide by 65536 */
/* r6/r7 = component * inv_alpha + 0x800080 (rounding bias). */
168 "mla r6, r6, r8, %[component_half]\n\t"
169 "mla r7, r7, r8, %[component_half]\n\t"
/* Add the high byte of each 16-bit lane back in: together with the
 * bias this approximates division by 255 (x*257 >> 16). */
171 "uxtab16 r6, r6, r6, ror #8\n\t"
172 "uxtab16 r7, r7, r7, ror #8\n\t"
174 /* recombine the 0xff00ff00 bytes of r6 and r7 */
175 "and r7, r7, %[upper_component_mask]\n\t"
176 "uxtab16 r6, r7, r6, ror #8\n\t"
/* dest = src + faded dest, per-byte saturating. */
178 "uqadd8 r5, r6, r5\n\t"
184 "str r5, [%[dest]], #4\n\t"
185 /* increment counter and jmp to top */
186 "subs %[w], %[w], #1\n\t"
189 : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
190 : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
191 [alpha_mask] "r" (alpha_mask)
192 : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
/*
 * OVER operator for a8r8g8b8 source with a constant (solid) a8 mask:
 * each source pixel is first multiplied by the mask alpha, then
 * composited over the destination, all in one ARMv6 inline-asm loop.
 * NOTE(review): several lines are elided from this excerpt (loop labels,
 * branches, width/height setup); comments describe only visible lines.
 */
198 arm_composite_over_8888_n_8888 (pixman_implementation_t * impl,
200 pixman_image_t * src_image,
201 pixman_image_t * mask_image,
202 pixman_image_t * dst_image,
212 uint32_t *dst_line, *dst;
213 uint32_t *src_line, *src;
215 int dst_stride, src_stride;
/* 0x800080: rounding bias for the x*257>>16 divide-by-255 trick. */
217 uint32_t component_half = 0x800080;
218 uint32_t alpha_mask = 0xff;
220 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
221 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
/* Collapse the solid mask image to a single ARGB value; presumably only
 * its alpha (shifted in the elided code) feeds [mask_alpha] below —
 * TODO confirm against the full file. */
223 mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
/* Advance to next scanline (outer loop elided from this view). */
229 dst_line += dst_stride;
231 src_line += src_stride;
234 /* #define inner_branch */
/* r5 = source pixel, post-increment src. */
240 "ldr r5, [%[src]], #4\n\t"
242 /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
243 * The 0x0 case also allows us to avoid doing an unecessary data
244 * write which is more valuable so we only check for that
/* r4 = destination pixel. */
250 "ldr r4, [%[dest]] \n\t"
/* Split src into even (r6, elided uxtb16) / odd (r7) byte lanes. */
253 "uxtb16 r7, r5, ror #8\n\t"
255 /* multiply by alpha (r8) then by 257 and divide by 65536 */
/* Scale source components by the constant mask alpha. */
256 "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
257 "mla r7, r7, %[mask_alpha], %[component_half]\n\t"
259 "uxtab16 r6, r6, r6, ror #8\n\t"
260 "uxtab16 r7, r7, r7, ror #8\n\t"
262 "uxtb16 r6, r6, ror #8\n\t"
263 "uxtb16 r7, r7, ror #8\n\t"
/* r5 = recombined, mask-scaled source pixel. */
266 "orr r5, r6, r7, lsl #8\n\t"
/* Split dest into byte lanes for fading by inverse source alpha. */
269 "uxtb16 r7, r4, ror #8\n\t"
/* r8 = 255 - (scaled) src alpha. */
272 "sub r8, %[alpha_mask], r5, lsr #24\n\t"
274 /* multiply by alpha (r8) then by 257 and divide by 65536 */
275 "mla r6, r6, r8, %[component_half]\n\t"
276 "mla r7, r7, r8, %[component_half]\n\t"
278 "uxtab16 r6, r6, r6, ror #8\n\t"
279 "uxtab16 r7, r7, r7, ror #8\n\t"
281 "uxtb16 r6, r6, ror #8\n\t"
282 "uxtb16 r7, r7, ror #8\n\t"
285 "orr r6, r6, r7, lsl #8\n\t"
/* dest = scaled src + faded dest, per-byte saturating. */
287 "uqadd8 r5, r6, r5\n\t"
293 "str r5, [%[dest]], #4\n\t"
294 /* increment counter and jmp to top */
295 "subs %[w], %[w], #1\n\t"
298 : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
299 : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
300 [alpha_mask] "r" (alpha_mask)
/* NOTE(review): r9 is listed as clobbered but no visible instruction
 * uses it — possibly needed by elided lines; confirm against full file. */
301 : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory"
/*
 * OVER operator for a solid source through an a8 mask onto a 32-bit
 * destination: src*mask_alpha OVER dest, one pixel per asm iteration.
 * The solid source is pre-split into interleaved component halves
 * (src_hi / src_lo) so each mla scales two components at once.
 * NOTE(review): several lines are elided from this excerpt (early-out
 * branches, loop labels, width/height setup); comments describe only
 * visible lines.
 */
307 arm_composite_over_n_8_8888 (pixman_implementation_t * impl,
309 pixman_image_t * src_image,
310 pixman_image_t * mask_image,
311 pixman_image_t * dst_image,
322 uint32_t *dst_line, *dst;
323 uint8_t *mask_line, *mask;
324 int dst_stride, mask_stride;
/* Collapse the solid source image to one pixel in the dest's format. */
327 src = _pixman_image_get_solid (src_image, dst_image->bits.format);
329 /* bail out if fully transparent */
/* 0xff00ff selects alternate components; 0x800080 is the rounding bias
 * for the x*257>>16 divide-by-255 trick used below. */
334 uint32_t component_mask = 0xff00ff;
335 uint32_t component_half = 0x800080;
/* Pre-split the solid colour: src_hi = A and G lanes, src_lo = R and B
 * lanes, each as 0x00XX00XX so one mla scales two components. */
337 uint32_t src_hi = (src >> 8) & component_mask;
338 uint32_t src_lo = src & component_mask;
340 PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
341 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
/* Advance to next scanline (outer loop elided from this view). */
346 dst_line += dst_stride;
348 mask_line += mask_stride;
351 /* #define inner_branch */
/* r5 = mask byte for this pixel, post-increment mask pointer. */
357 "ldrb r5, [%[mask]], #1\n\t"
359 /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
360 * The 0x0 case also allows us to avoid doing an unecessary data
361 * write which is more valuable so we only check for that
/* r4 = destination pixel. */
367 "ldr r4, [%[dest]] \n\t"
369 /* multiply by alpha (r8) then by 257 and divide by 65536 */
/* Scale the pre-split solid source by the mask byte (r5). */
370 "mla r6, %[src_lo], r5, %[component_half]\n\t"
371 "mla r7, %[src_hi], r5, %[component_half]\n\t"
373 "uxtab16 r6, r6, r6, ror #8\n\t"
374 "uxtab16 r7, r7, r7, ror #8\n\t"
376 "uxtb16 r6, r6, ror #8\n\t"
377 "uxtb16 r7, r7, ror #8\n\t"
/* r5 = recombined, mask-scaled source pixel. */
380 "orr r5, r6, r7, lsl #8\n\t"
/* Split dest into byte lanes for fading by inverse source alpha. */
383 "uxtb16 r7, r4, ror #8\n\t"
385 /* we could simplify this to use 'sub' if we were
386 * willing to give up a register for alpha_mask
/* r8 = inverse alpha extracted by shift (the complement itself is in
 * an elided instruction — presumably mvn — TODO confirm). */
389 "mov r8, r8, lsr #24\n\t"
391 /* multiply by alpha (r8) then by 257 and divide by 65536 */
392 "mla r6, r6, r8, %[component_half]\n\t"
393 "mla r7, r7, r8, %[component_half]\n\t"
395 "uxtab16 r6, r6, r6, ror #8\n\t"
396 "uxtab16 r7, r7, r7, ror #8\n\t"
398 "uxtb16 r6, r6, ror #8\n\t"
399 "uxtb16 r7, r7, ror #8\n\t"
402 "orr r6, r6, r7, lsl #8\n\t"
/* dest = scaled src + faded dest, per-byte saturating. */
404 "uqadd8 r5, r6, r5\n\t"
410 "str r5, [%[dest]], #4\n\t"
411 /* increment counter and jmp to top */
412 "subs %[w], %[w], #1\n\t"
415 : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask)
416 : [component_half] "r" (component_half),
417 [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
418 : "r4", "r5", "r6", "r7", "r8", "cc", "memory");
/*
 * Fast-path dispatch table: maps (operator, src format, mask, dest
 * format) tuples to the ARMv6 SIMD implementations above.  Consumed by
 * _pixman_implementation_create() via the creator function below.
 */
422 static const pixman_fast_path_t arm_simd_fast_paths[] =
/* OVER with no mask: per-pixel alpha compositing, ARGB and ABGR. */
424 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, arm_composite_over_8888_8888),
425 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, arm_composite_over_8888_8888),
426 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, arm_composite_over_8888_8888),
427 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, arm_composite_over_8888_8888),
/* OVER with a solid (constant-alpha) mask. */
428 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, arm_composite_over_8888_n_8888),
429 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, arm_composite_over_8888_n_8888),
/* Saturating ADD of two 8-bit alpha surfaces. */
431 PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, arm_composite_add_8000_8000),
/* Solid colour through an a8 mask, all four 32-bit dest variants. */
433 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, arm_composite_over_n_8_8888),
434 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, arm_composite_over_n_8_8888),
435 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, arm_composite_over_n_8_8888),
436 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, arm_composite_over_n_8_8888),
441 pixman_implementation_t *
442 _pixman_implementation_create_arm_simd (void)
444 pixman_implementation_t *general = _pixman_implementation_create_fast_path ();
445 pixman_implementation_t *imp = _pixman_implementation_create (general, arm_simd_fast_paths);