/*
 * Copyright © 2008 Mozilla Corporation
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Mozilla Corporation not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  Mozilla Corporation makes no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Jeff Muizelaar (jeff@infidigm.net)
 */
#include "pixman-private.h"
#include "pixman-arm-common.h"
#include "pixman-inlines.h"
#if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */

void
pixman_composite_add_8_8_asm_armv6 (int32_t width,
        dst_line += dst_stride;
        src_line += src_stride;
        /* ensure both src and dst are properly aligned before doing 32-bit reads;
         * we'll stay in this loop if src and dst have differing alignments
         */
        while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
            asm ("uqadd8 %0, %1, %2" : "=r" (d) : "r" (s), "r" (d));
            asm ("uqadd8 %0, %1, %2"
                 : "=r" (*(uint32_t*)dst)
                 : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst));
            asm ("uqadd8 %0, %1, %2" : "=r" (d) : "r" (s), "r" (d));
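/* For reference, a minimal C sketch of the per-byte saturating add that the
 * 'uqadd8' instruction above performs on four packed bytes at once.  The
 * helper name 'saturating_add_u8' is hypothetical, not part of pixman.
 */
static inline uint8_t
saturating_add_u8 (uint8_t s, uint8_t d)
{
    unsigned sum = (unsigned)s + d;

    return sum > 0xff ? 0xff : (uint8_t)sum; /* clamp to 255 instead of wrapping */
}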
void
pixman_composite_over_8888_8888_asm_armv6 (int32_t width,
    uint32_t component_half = 0x800080;
    uint32_t upper_component_mask = 0xff00ff00;
    uint32_t alpha_mask = 0xff;
        dst_line += dst_stride;
        src_line += src_stride;
/* #define inner_branch */
        "ldr r5, [%[src]], #4\n\t"
#ifdef inner_branch
        /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
         * The 0x0 case also allows us to avoid doing an unnecessary data
         * write, which is more valuable, so we only check for that.
         */
        "cmp r5, #0\n\t"
        "beq 2f\n\t"

        "sub r8, %[alpha_mask], r5, lsr #24\n\t" /* r8 = 255 - alpha */
        "ldr r4, [%[dest]] \n\t"
#else
        "ldr r4, [%[dest]] \n\t"
        "sub r8, %[alpha_mask], r5, lsr #24\n\t" /* r8 = 255 - alpha */
#endif
        "uxtb16 r6, r4\n\t"
        "uxtb16 r7, r4, ror #8\n\t"
        /* multiply by 257 and divide by 65536 */
        "mla r6, r6, r8, %[component_half]\n\t"
        "mla r7, r7, r8, %[component_half]\n\t"

        "uxtab16 r6, r6, r6, ror #8\n\t"
        "uxtab16 r7, r7, r7, ror #8\n\t"
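        /* Annotation (not in the original): after the 'mla', each 16-bit lane
         * holds x = c * (255 - alpha) + 0x80.  'uxtab16 rN, rN, rN, ror #8'
         * adds x >> 8 into x, so the high byte of each lane becomes
         * (x + (x >> 8)) >> 8, i.e. c * (255 - alpha) / 255 rounded -- the
         * "multiply by 257 and divide by 65536" mentioned above.
         */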
        /* recombine the 0xff00ff00 bytes of r6 and r7 */
        "and r7, r7, %[upper_component_mask]\n\t"
        "uxtab16 r6, r7, r6, ror #8\n\t"

        "uqadd8 r5, r6, r5\n\t"

        "str r5, [%[dest]], #4\n\t"
        /* decrement the counter and branch back to the top */
        "subs %[w], %[w], #1\n\t"
        : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
        : [component_half] "r" (component_half),
          [upper_component_mask] "r" (upper_component_mask),
          [alpha_mask] "r" (alpha_mask)
        : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
        );
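/* For reference, a minimal C sketch of the per-pixel OVER operation the
 * assembly above implements on premultiplied a8r8g8b8 pixels:
 * dest = src + dest * (255 - src_alpha) / 255, per component.  The helper
 * name 'over_8888_pixel' is hypothetical, not part of pixman.
 */
static inline uint32_t
over_8888_pixel (uint32_t src, uint32_t dst)
{
    uint32_t ialpha = 255 - (src >> 24);
    /* red/blue lanes and alpha/green lanes, each multiplied with a +0x80 bias */
    uint32_t rb = (dst & 0x00ff00ff) * ialpha + 0x00800080;
    uint32_t ag = ((dst >> 8) & 0x00ff00ff) * ialpha + 0x00800080;

    /* x += x >> 8; the high byte of each 16-bit lane is then x / 255 */
    rb += (rb >> 8) & 0x00ff00ff;
    ag += (ag >> 8) & 0x00ff00ff;

    dst = ((rb >> 8) & 0x00ff00ff) | (ag & 0xff00ff00);

    /* the asm uses a saturating add (uqadd8) here; with valid premultiplied
     * pixels a plain add cannot overflow any component */
    return src + dst;
}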
void
pixman_composite_over_8888_n_8888_asm_armv6 (int32_t width,
    uint32_t component_half = 0x800080;
    uint32_t alpha_mask = 0xff;
        dst_line += dst_stride;
        src_line += src_stride;
/* #define inner_branch */

        "ldr r5, [%[src]], #4\n\t"
        /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
         * The 0x0 case also allows us to avoid doing an unnecessary data
         * write, which is more valuable, so we only check for that.
         */
        "ldr r4, [%[dest]] \n\t"
        "uxtb16 r6, r5\n\t"
        "uxtb16 r7, r5, ror #8\n\t"
        /* multiply by the mask's alpha (%[mask_alpha]), then by 257, and divide by 65536 */
        "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
        "mla r7, r7, %[mask_alpha], %[component_half]\n\t"

        "uxtab16 r6, r6, r6, ror #8\n\t"
        "uxtab16 r7, r7, r7, ror #8\n\t"
        "uxtb16 r6, r6, ror #8\n\t"
        "uxtb16 r7, r7, ror #8\n\t"

        "orr r5, r6, r7, lsl #8\n\t"
        "uxtb16 r6, r4\n\t"
        "uxtb16 r7, r4, ror #8\n\t"

        "sub r8, %[alpha_mask], r5, lsr #24\n\t" /* r8 = 255 - alpha */
        /* multiply by alpha (r8) then by 257 and divide by 65536 */
        "mla r6, r6, r8, %[component_half]\n\t"
        "mla r7, r7, r8, %[component_half]\n\t"

        "uxtab16 r6, r6, r6, ror #8\n\t"
        "uxtab16 r7, r7, r7, ror #8\n\t"

        "uxtb16 r6, r6, ror #8\n\t"
        "uxtb16 r7, r7, ror #8\n\t"

        "orr r6, r6, r7, lsl #8\n\t"

        "uqadd8 r5, r6, r5\n\t"
        "str r5, [%[dest]], #4\n\t"
        /* decrement the counter and branch back to the top */
        "subs %[w], %[w], #1\n\t"
        : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
        : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
          [alpha_mask] "r" (alpha_mask)
        : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory"
        );
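/* For reference, a minimal C sketch of the first half of the loop above:
 * every component of the premultiplied source pixel is scaled by the mask's
 * constant alpha before the usual OVER math.  The helper name 'in_8888_n' is
 * hypothetical, not part of pixman; mask_alpha is the mask's top byte.
 */
static inline uint32_t
in_8888_n (uint32_t src, uint32_t mask_alpha)
{
    uint32_t rb = (src & 0x00ff00ff) * mask_alpha + 0x00800080;
    uint32_t ag = ((src >> 8) & 0x00ff00ff) * mask_alpha + 0x00800080;

    rb += (rb >> 8) & 0x00ff00ff;
    ag += (ag >> 8) & 0x00ff00ff;

    /* each pixel is then composited as over_8888_pixel (in_8888_n (s, m), d) */
    return ((rb >> 8) & 0x00ff00ff) | (ag & 0xff00ff00);
}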
void
pixman_composite_over_n_8_8888_asm_armv6 (int32_t width,
    uint32_t component_mask = 0xff00ff;
    uint32_t component_half = 0x800080;

    uint32_t src_hi = (src >> 8) & component_mask;
    uint32_t src_lo = src & component_mask;
        dst_line += dst_stride;
        mask_line += mask_stride;
/* #define inner_branch */

        "ldrb r5, [%[mask]], #1\n\t"
        /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
         * The 0x0 case also allows us to avoid doing an unnecessary data
         * write, which is more valuable, so we only check for that.
         */
        "ldr r4, [%[dest]] \n\t"
        /* multiply by the mask byte (r5), then by 257, and divide by 65536 */
        "mla r6, %[src_lo], r5, %[component_half]\n\t"
        "mla r7, %[src_hi], r5, %[component_half]\n\t"
        "uxtab16 r6, r6, r6, ror #8\n\t"
        "uxtab16 r7, r7, r7, ror #8\n\t"

        "uxtb16 r6, r6, ror #8\n\t"
        "uxtb16 r7, r7, ror #8\n\t"

        "orr r5, r6, r7, lsl #8\n\t"
        "uxtb16 r6, r4\n\t"
        "uxtb16 r7, r4, ror #8\n\t"
        /* we could simplify this to use 'sub' if we were
         * willing to give up a register for alpha_mask
         */
        "mvn r8, r5\n\t"
        "mov r8, r8, lsr #24\n\t" /* r8 = 255 - alpha */
        /* multiply by alpha (r8) then by 257 and divide by 65536 */
        "mla r6, r6, r8, %[component_half]\n\t"
        "mla r7, r7, r8, %[component_half]\n\t"

        "uxtab16 r6, r6, r6, ror #8\n\t"
        "uxtab16 r7, r7, r7, ror #8\n\t"

        "uxtb16 r6, r6, ror #8\n\t"
        "uxtb16 r7, r7, ror #8\n\t"

        "orr r6, r6, r7, lsl #8\n\t"

        "uqadd8 r5, r6, r5\n\t"
        "str r5, [%[dest]], #4\n\t"
        /* decrement the counter and branch back to the top */
        "subs %[w], %[w], #1\n\t"
        : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask)
        : [component_half] "r" (component_half),
          [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
        : "r4", "r5", "r6", "r7", "r8", "cc", "memory");
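/* Annotation (not in the original): this variant composites a solid source
 * through an a8 mask read one byte at a time ('ldrb').  Conceptually each
 * pixel is over_8888_pixel (in_8888_n (src, mask_byte), dest) in terms of
 * the hypothetical helpers sketched above; src_lo/src_hi are pre-split
 * outside the loop so the mask multiply needs no per-pixel unpacking.
 */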
#endif

PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
                                   uint8_t, 1, uint8_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
                                   uint32_t, 1, uint32_t, 1)

PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888,
                                     uint32_t, 1, uint32_t, 1)

PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888,
                                      uint8_t, 1, uint32_t, 1)
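/* Annotation (not in the original): each PIXMAN_ARM_BIND_* macro expands to a
 * C wrapper (e.g. armv6_composite_add_8_8) that unpacks the composite-request
 * arguments into line pointers, strides and width/height, then calls the
 * corresponding pixman_composite_*_asm_armv6 routine.
 */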
PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
                                        uint16_t, uint16_t)
PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
                                        uint32_t, uint32_t)
static const pixman_fast_path_t arm_simd_fast_paths[] =
{
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, armv6_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, armv6_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, armv6_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888),

    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, armv6_composite_add_8_8),

    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, armv6_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, armv6_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),

    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565),
    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565),

    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, armv6_8888_8888),
    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, armv6_8888_8888),
    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, armv6_8888_8888),
    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, armv6_8888_8888),
    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, armv6_8888_8888),
    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, armv6_8888_8888),
pixman_implementation_t *
_pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp =
        _pixman_implementation_create (fallback, arm_simd_fast_paths);