2 * Copyright © 2008 Mozilla Corporation
3 * Copyright © 2010 Nokia Corporation
5 * Permission to use, copy, modify, distribute, and sell this software and its
6 * documentation for any purpose is hereby granted without fee, provided that
7 * the above copyright notice appear in all copies and that both that
8 * copyright notice and this permission notice appear in supporting
9 * documentation, and that the name of Mozilla Corporation not be used in
10 * advertising or publicity pertaining to distribution of the software without
11 * specific, written prior permission. Mozilla Corporation makes no
12 * representations about the suitability of this software for any purpose. It
13 * is provided "as is" without express or implied warranty.
15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
24 * Author: Jeff Muizelaar (jeff@infidigm.net)
28 /* Prevent the stack from becoming executable */
29 #if defined(__linux__) && defined(__ELF__)
30 .section .note.GNU-stack,"",%progbits
39 /* Supplementary macro for setting function attributes */
/* Declares 'fname' as an ELF function symbol so the linker/debugger see
 * correct symbol types.
 * NOTE(review): only part of the macro body is visible in this chunk (no
 * .endm in view); presumably it also emits .global/.func and alignment
 * directives — confirm against the full file. */
40 .macro pixman_asm_function fname
45 .type fname, %function
51 * The code below was generated by gcc 4.3.4 from the commented out
52 * functions in 'pixman-arm-simd.c' file with the following optimization
53 * options: "-O3 -mcpu=arm1136jf-s -fomit-frame-pointer"
55 * TODO: replace gcc generated code with hand tuned versions because
56 * the code quality is not very good, introduce symbolic register
57 * aliases for better readability and maintainability.
/* ARMv6 fast path for the ADD operator on a8 source over a8 destination.
 * gcc-generated code (see file header); interior of the function is not
 * visible in this chunk. */
60 pixman_asm_function pixman_composite_add_8_8_asm_armv6
/* Save all AAPCS callee-saved core registers used by the routine. */
61 push {r4, r5, r6, r7, r8, r9, r10, r11}
/* Restore callee-saved registers before returning. */
95 pop {r4, r5, r6, r7, r8, r9, r10, r11}
/* r12 = r6 - (r5 << 2), updating flags.
 * NOTE(review): looks like a remaining-byte-count computation for a
 * cleanup loop, but the surrounding lines are not visible — confirm. */
118 subs r12, r6, r5, lsl #2
/* ARMv6 fast path for the OVER operator: a8r8g8b8 source over a8r8g8b8
 * destination. Uses uxtb16/uxtab16 to process two 8-bit channels at a
 * time in 16-bit lanes. Interior of the function is not visible here. */
137 pixman_asm_function pixman_composite_over_8888_8888_asm_armv6
/* Save AAPCS callee-saved core registers. */
138 push {r4, r5, r6, r7, r8, r9, r10, r11}
/* r8 = r11 - (r5 >> 24): r5>>24 extracts the source alpha byte.
 * NOTE(review): r11 presumably holds the 0xff/bias constant so this
 * computes (255 - alpha) for the OVER blend — its load is outside this
 * view, confirm. */
169 sub r8, r11, r5, lsr #24
/* uxtb16 with ror #8 zero-extends bytes 1 and 3 of r4 (G and A channels)
 * into the two 16-bit halves of r7. */
171 uxtb16 r7, r4, ror #8
/* Per-channel accumulate: adds bytes 1/3 of the rotated source into the
 * 16-bit lanes — part of the (x*a + x) >> 8 rounding sequence. */
174 uxtab16 r6, r6, r6, ror #8
175 uxtab16 r7, r7, r7, ror #8
177 uxtab16 r6, r7, r6, ror #8
/* Restore callee-saved registers before returning. */
188 pop {r4, r5, r6, r7, r8, r9, r10, r11}
/* ARMv6 fast path for OVER with a constant mask: a8r8g8b8 source,
 * solid 8-bit mask, a8r8g8b8 destination. Channel math is done two
 * channels at a time in 16-bit lanes via uxtb16/uxtab16. Interior lines
 * are not visible in this chunk. */
192 pixman_asm_function pixman_composite_over_8888_n_8888_asm_armv6
/* Save AAPCS callee-saved core registers. */
193 push {r4, r5, r6, r7, r8, r9, r10, r11}
/* Extract bytes 1/3 (G, A) of r5 into 16-bit lanes of r7. */
228 uxtb16 r7, r5, ror #8
/* Multiply-and-round sequence: accumulate each product's high byte into
 * itself, then take the high bytes — implements (x + (x >> 8)) >> 8 style
 * division by 255 per channel. */
231 uxtab16 r6, r6, r6, ror #8
232 uxtab16 r7, r7, r7, ror #8
233 uxtb16 r6, r6, ror #8
234 uxtb16 r7, r7, ror #8
/* Re-interleave even-channel lanes (r6) and odd-channel lanes (r7)
 * back into one packed 8888 pixel in r5. */
235 orr r5, r6, r7, lsl #8
/* Extract bytes 1/3 of the destination pixel in r4. */
237 uxtb16 r7, r4, ror #8
/* r8 = r11 - (r5 >> 24): r5>>24 is the (masked) source alpha.
 * NOTE(review): r11 presumably holds the 0xff/bias constant, giving
 * (255 - alpha) — its load is outside this view, confirm. */
238 sub r8, r11, r5, lsr #24
/* Same per-channel multiply/round sequence for the destination side. */
241 uxtab16 r6, r6, r6, ror #8
242 uxtab16 r7, r7, r7, ror #8
243 uxtb16 r6, r6, ror #8
244 uxtb16 r7, r7, ror #8
/* Recombine lanes into the final packed result pixel. */
245 orr r6, r6, r7, lsl #8
/* Restore callee-saved registers before returning. */
257 pop {r4, r5, r6, r7, r8, r9, r10, r11}
/* ARMv6 fast path for OVER with a solid source and an a8 mask over an
 * a8r8g8b8 destination. Interior lines are not visible in this chunk. */
261 pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
/* Save AAPCS callee-saved core registers. */
262 push {r4, r5, r6, r7, r8, r9, r10, r11}
/* #-16777216 == 0xff000000: bic clears the top (alpha) byte, keeping the
 * low 24 RGB bits of r9 in r1. */
267 bic r1, r9, #-16777216
272 bic r1, r1, #-16777216
/* Per-channel multiply/round: accumulate the rotated high bytes, then
 * extract the high bytes — the gcc-generated divide-by-255 pattern. */
302 uxtab16 r6, r6, r6, ror #8
303 uxtab16 r7, r7, r7, ror #8
304 uxtb16 r6, r6, ror #8
305 uxtb16 r7, r7, ror #8
/* Re-interleave even (r6) and odd (r7) channel lanes into one pixel. */
306 orr r5, r6, r7, lsl #8
/* Extract bytes 1/3 (G, A) of the destination pixel in r4. */
308 uxtb16 r7, r4, ror #8
/* Same multiply/round sequence for the destination contribution. */
313 uxtab16 r6, r6, r6, ror #8
314 uxtab16 r7, r7, r7, ror #8
315 uxtb16 r6, r6, ror #8
316 uxtb16 r7, r7, ror #8
/* Recombine lanes into the final packed 8888 result. */
317 orr r6, r6, r7, lsl #8
/* Restore callee-saved registers before returning. */
329 pop {r4, r5, r6, r7, r8, r9, r10, r11}
334 * Note: This function is only using armv4t instructions (not even armv6),
335 * but is scheduled for ARM Cortex-A8 pipeline. So it might need to
336 * be split into a few variants, tuned for each microarchitecture.
338 * TODO: In order to get good performance on ARM9/ARM11 cores (which don't
339 * have efficient write combining), it needs to be changed to use 16-byte
340 * aligned writes using STM instruction.
/* Nearest-neighbour scaling of one r5g6b5 scanline with the SRC operator
 * (see the preceding header comment: armv4t instructions, scheduled for
 * Cortex-A8). Interior lines are not visible in this chunk; register
 * aliases SRC/DST/VX/VXMASK/TMP1/TMP2 are defined outside this view. */
342 pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
/* Only r4-r7 are used beyond the argument registers; save them. */
353 push {r4, r5, r6, r7}
356 /* define helper macro */
/* Loads and advances two 16-bit 0565 pixels, interleaving the address
 * computation of one pixel with the load of the other to hide latency.
 * NOTE(review): the macro body is only partially visible (no .endm in
 * view) — the store side and the VX increment are outside this chunk. */
357 .macro scale_2_pixels
/* Load a 16-bit pixel from SRC at the previously computed offset. */
358 ldrh TMP1, [SRC, TMP1]
/* Next source offset: VX is presumably a 16.16 fixed-point x coordinate,
 * so (VX >> 15) & VXMASK yields a 2-byte-aligned pixel offset — TODO
 * confirm against the register setup outside this view. */
359 and TMP2, VXMASK, VX, lsr #15
363 ldrh TMP2, [SRC, TMP2]
364 and TMP1, VXMASK, VX, lsr #15
369 /* now do the scaling */
/* Prime the first source offset before entering the main loop. */
370 and TMP1, VXMASK, VX, lsr #15
374 1: /* main loop, process 4 pixels per iteration */
/* Conditionally (ne) copy one trailing pixel when the width is odd. */
385 ldrneh TMP1, [SRC, TMP1]
386 strneh TMP1, [DST], #2
387 /* cleanup helper macro */
/* Delete the local macro so the name can be reused elsewhere. */
388 .purgem scale_2_pixels