2 * Copyright © 2008 Mozilla Corporation
3 * Copyright © 2010 Nokia Corporation
5 * Permission to use, copy, modify, distribute, and sell this software and its
6 * documentation for any purpose is hereby granted without fee, provided that
7 * the above copyright notice appear in all copies and that both that
8 * copyright notice and this permission notice appear in supporting
9 * documentation, and that the name of Mozilla Corporation not be used in
10 * advertising or publicity pertaining to distribution of the software without
11 * specific, written prior permission. Mozilla Corporation makes no
12 * representations about the suitability of this software for any purpose. It
13 * is provided "as is" without express or implied warranty.
15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
24 * Author: Jeff Muizelaar (jeff@infidigm.net)
28 /* Prevent the stack from becoming executable */
29 #if defined(__linux__) && defined(__ELF__)
/* Emitting an empty .note.GNU-stack section tells the GNU linker that this
 * object does not require an executable stack; without it, linking hand
 * written assembly can silently make the whole process stack executable.
 * NOTE(review): the matching #endif is outside this excerpt -- confirm. */
30 .section .note.GNU-stack,"",%progbits
40 /* Supplementary macro for setting function attributes */
41 .macro pixman_asm_function fname
/* Marks the symbol as an ARM function in the ELF symbol table so the
 * linker/disassembler treat it correctly (e.g. for ARM/Thumb interworking).
 * NOTE(review): the .global/.func directives, the label definition and the
 * closing .endm are elided from this excerpt -- do not edit from this view. */
46 .type fname, %function
52 * The code below was generated by gcc 4.3.4 from the commented out
53 * functions in 'pixman-arm-simd.c' file with the following optimization
54 * options: "-O3 -mcpu=arm1136jf-s -fomit-frame-pointer"
56 * TODO: replace gcc generated code with hand tuned versions because
57 * the code quality is not very good, introduce symbolic register
58 * aliases for better readability and maintainability.
/* pixman_composite_add_8_8_asm_armv6
 * Presumably the ADD compositing operator for an 8-bit source onto an 8-bit
 * destination (gcc 4.3.4 generated code -- see the note above this block).
 * AAPCS-compliant: all callee-saved core registers r4-r11 are preserved.
 * NOTE(review): the entire loop body between the push and pop below is
 * elided in this excerpt; treat this block as read-only documentation. */
61 pixman_asm_function pixman_composite_add_8_8_asm_armv6
62 push {r4, r5, r6, r7, r8, r9, r10, r11}
96 pop {r4, r5, r6, r7, r8, r9, r10, r11}
/* r12 = r6 - (r5 << 2), flag-setting -- presumably a stride/width
 * adjustment computation; verify against the full file (this line sits
 * after the visible pop, so surrounding context is missing here). */
119 subs r12, r6, r5, lsl #2
/* pixman_composite_over_8888_8888_asm_armv6
 * Presumably the Porter-Duff OVER operator for a8r8g8b8 source over
 * a8r8g8b8 destination (gcc-generated -- see the note near the top).
 * The uxtb16 ..., ror #8 idiom splits a packed 32-bit pixel into two
 * 16-bit lanes holding alternating color bytes, enabling SIMD-style
 * per-channel arithmetic in ordinary core registers.
 * NOTE(review): most instructions of this function are elided in this
 * excerpt; the lines below are a non-contiguous sample. */
138 pixman_asm_function pixman_composite_over_8888_8888_asm_armv6
139 push {r4, r5, r6, r7, r8, r9, r10, r11}
/* r8 = r11 - (r5 >> 24) -- looks like inverse alpha (255 - src_alpha),
 * assuming r11 holds #255; TODO confirm against the full file. */
170 sub r8, r11, r5, lsr #24
/* Extract the odd bytes (A and G, presumably) of the pixel in r4. */
172 uxtb16 r7, r4, ror #8
175 uxtab16 r6, r6, r6, ror #8
176 uxtab16 r7, r7, r7, ror #8
178 uxtab16 r6, r7, r6, ror #8
189 pop {r4, r5, r6, r7, r8, r9, r10, r11}
/* pixman_composite_over_8888_n_8888_asm_armv6
 * Presumably OVER of an a8r8g8b8 source, modulated by a constant ("n")
 * mask, onto an a8r8g8b8 destination (gcc-generated code).
 * Visible structure: two rounds of the split/multiply/repack idiom --
 * uxtb16/uxtab16 with ror #8 separate a packed pixel into two 16-bit
 * byte lanes, and orr ..., lsl #8 interleaves the lanes back into a
 * packed 32-bit pixel.
 * NOTE(review): many instructions are elided in this excerpt (including
 * the multiplies the uxtab16 results feed); treat as read-only. */
193 pixman_asm_function pixman_composite_over_8888_n_8888_asm_armv6
194 push {r4, r5, r6, r7, r8, r9, r10, r11}
/* Split pixel in r5 into byte lanes (odd bytes into r7). */
229 uxtb16 r7, r5, ror #8
232 uxtab16 r6, r6, r6, ror #8
233 uxtab16 r7, r7, r7, ror #8
234 uxtb16 r6, r6, ror #8
235 uxtb16 r7, r7, ror #8
/* Repack: r5 = even-byte lanes | (odd-byte lanes << 8). */
236 orr r5, r6, r7, lsl #8
/* Second round, operating on the pixel in r4. */
238 uxtb16 r7, r4, ror #8
/* r8 = r11 - (r5 >> 24) -- looks like 255 - alpha, assuming r11 holds
 * #255; TODO confirm against the full file. */
239 sub r8, r11, r5, lsr #24
242 uxtab16 r6, r6, r6, ror #8
243 uxtab16 r7, r7, r7, ror #8
244 uxtb16 r6, r6, ror #8
245 uxtb16 r7, r7, ror #8
/* Repack the second result into r6. */
246 orr r6, r6, r7, lsl #8
258 pop {r4, r5, r6, r7, r8, r9, r10, r11}
/* pixman_composite_over_n_8_8888_asm_armv6
 * Presumably OVER of a solid ("n") source through an 8-bit alpha mask
 * onto an a8r8g8b8 destination (gcc-generated code).
 * NOTE(review): many instructions are elided in this excerpt. */
262 pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
263 push {r4, r5, r6, r7, r8, r9, r10, r11}
/* #-16777216 == 0xFF000000, so bic clears the top (alpha) byte, keeping
 * the RGB channels of the word being masked. */
268 bic r1, r9, #-16777216
273 bic r1, r1, #-16777216
/* Split/multiply/repack idiom (see sibling functions above): uxtb16 and
 * uxtab16 with ror #8 spread a packed pixel across two 16-bit byte
 * lanes; orr ..., lsl #8 interleaves them back into a packed pixel. */
303 uxtab16 r6, r6, r6, ror #8
304 uxtab16 r7, r7, r7, ror #8
305 uxtb16 r6, r6, ror #8
306 uxtb16 r7, r7, ror #8
307 orr r5, r6, r7, lsl #8
309 uxtb16 r7, r4, ror #8
314 uxtab16 r6, r6, r6, ror #8
315 uxtab16 r7, r7, r7, ror #8
316 uxtb16 r6, r6, ror #8
317 uxtb16 r7, r7, ror #8
318 orr r6, r6, r7, lsl #8
330 pop {r4, r5, r6, r7, r8, r9, r10, r11}
335 * Note: This code is only using armv5te instructions (not even armv6),
336 * but is scheduled for ARM Cortex-A8 pipeline. So it might need to
337 * be split into a few variants, tuned for each microarchitecture.
339 * TODO: In order to get good performance on ARM9/ARM11 cores (which don't
340 * have efficient write combining), it needs to be changed to use 16-byte
341 * aligned writes using STM instruction.
343 * Nearest scanline scaler macro template uses the following arguments:
344 * fname - name of the function to generate
345 * bpp_shift - (1 << bpp_shift) is the size of pixel in bytes
346 * t - type suffix for LDR/STR instructions
347 * prefetch_distance - prefetch in the source image by that many pixels ahead
349 * prefetch_braking_distance - stop prefetching when that many pixels are
350 * remaining before the end of scanline
/* generate_nearest_scanline_func -- macro template that emits one
 * nearest-neighbor scanline scaling function per instantiation (see the
 * argument documentation in the comment block above).
 * Visible conventions: VX is a 16.16 fixed-point source x coordinate;
 * VX >> (16 - bpp_shift) converts it to a byte offset, and VXMASK
 * (= ~((1 << bpp_shift) - 1)) rounds that offset down to a pixel
 * boundary. The "t" argument is pasted onto ldr/str via "&t" to select
 * halfword vs word accesses.
 * NOTE(review): a large number of lines (including register aliases such
 * as SRC/DST/VX/UNIT_X/W and the closing .endm) are elided in this
 * excerpt; treat this block as read-only. */
353 .macro generate_nearest_scanline_func fname, bpp_shift, t, \
355 prefetch_braking_distance
357 pixman_asm_function fname
369 push {r4, r5, r6, r7}
/* VXMASK = ~(pixel_size_in_bytes - 1): mask that pixel-aligns a byte
 * offset derived from the fixed-point coordinate. */
370 mvn VXMASK, #((1 << bpp_shift) - 1)
372 /* define helper macro */
/* scale_2_pixels: copies two nearest-neighbor source pixels to DST,
 * interleaving the offset computation for the next pixel with the
 * load/store of the current one (software pipelining). */
373 .macro scale_2_pixels
374 ldr&t TMP1, [SRC, TMP1]
375 and TMP2, VXMASK, VX, lsr #(16 - bpp_shift)
377 str&t TMP1, [DST], #(1 << bpp_shift)
379 ldr&t TMP2, [SRC, TMP2]
380 and TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
382 str&t TMP2, [DST], #(1 << bpp_shift)
385 /* now do the scaling */
386 and TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
/* Reserve 8 pixels plus the prefetch braking distance; the flag result
 * presumably selects between the prefetching main loop and the tail. */
388 subs W, W, #(8 + prefetch_braking_distance)
390 /* calculate prefetch offset */
391 mov PF_OFFS, #prefetch_distance
/* PF_OFFS = VX + UNIT_X * prefetch_distance, i.e. the fixed-point
 * coordinate prefetch_distance pixels ahead of the current position. */
392 mla PF_OFFS, UNIT_X, PF_OFFS, VX
393 1: /* main loop, process 8 pixels per iteration with prefetch */
/* Advance the prefetch coordinate by 8 * UNIT_X per iteration.
 * NOTE(review): this looks truncated -- a destination/source operand of
 * the 3-operand add appears to be missing; confirm against the full
 * file before assuming a defect. */
395 add PF_OFFS, UNIT_X, lsl #3
/* Prefetch the source cache line the loop will need shortly. */
400 pld [SRC, PF_OFFS, lsr #(16 - bpp_shift)]
/* Restore the braking-distance reservation minus the 4 pixels handled
 * here (#(4 - 8 - prefetch_braking_distance) undoes the earlier subs). */
403 subs W, W, #(4 - 8 - prefetch_braking_distance)
405 1: /* process the remaining pixels */
/* Conditional load: only fetch when the preceding flag test indicated a
 * pixel remains (ldrne + type suffix t). */
416 ldrne&t TMP1, [SRC, TMP1]
418 /* cleanup helper macro */
419 .purgem scale_2_pixels
/* r5g6b5 -> r5g6b5 SRC scaler: 2-byte pixels (bpp_shift = 1), halfword
 * loads/stores ("h" suffix), prefetch distance 80, braking distance 32. */
435 generate_nearest_scanline_func \
436 pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32
/* a8r8g8b8 -> a8r8g8b8 SRC scaler: 4-byte pixels (bpp_shift = 2), plain
 * word loads/stores (empty suffix), prefetch distance 48, braking 32. */
438 generate_nearest_scanline_func \
439 pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2, , 48, 32