From 3ef203331f124bf137c6e0c8d5516b1209c92dd9 Mon Sep 17 00:00:00 2001 From: Siarhei Siamashka Date: Mon, 22 Mar 2010 21:56:17 +0200 Subject: [PATCH] ARM: SIMD optimizations moved to a separate .S file This should be the last step in providing full armv4t compatibility with CPU features runtime autodetection in pixman. --- configure.ac | 48 +++---- pixman/Makefile.am | 9 +- pixman/pixman-arm-simd-asm.S | 330 +++++++++++++++++++++++++++++++++++++++++++ pixman/pixman-arm-simd.c | 4 + 4 files changed, 359 insertions(+), 32 deletions(-) create mode 100644 pixman/pixman-arm-simd-asm.S diff --git a/configure.ac b/configure.ac index 4668715..ed7d16a 100644 --- a/configure.ac +++ b/configure.ac @@ -361,30 +361,24 @@ AC_SUBST(VMX_CFLAGS) AM_CONDITIONAL(USE_VMX, test $have_vmx_intrinsics = yes) -dnl =========================================================================== -dnl Check for ARM SIMD instructions -ARM_SIMD_CFLAGS="" - +dnl ========================================================================== +dnl Check if assembler is gas compatible and supports ARM SIMD instructions have_arm_simd=no AC_MSG_CHECKING(whether to use ARM SIMD assembler) -# check with default CFLAGS in case the toolchain turns on a sufficiently recent -mcpu= -AC_COMPILE_IFELSE([ -int main () { - asm("uqadd8 r1, r1, r2"); - return 0; -}], have_arm_simd=yes, - # check again with an explicit -mcpu= in case the toolchain defaults to an - # older one; note that uqadd8 isn't available in Thumb mode on arm1136j-s - # so we force ARM mode - ARM_SIMD_CFLAGS="-mcpu=arm1136j-s -marm" - xserver_save_CFLAGS=$CFLAGS - CFLAGS="$ARM_SIMD_CFLAGS $CFLAGS" - AC_COMPILE_IFELSE([ - int main () { - asm("uqadd8 r1, r1, r2"); - return 0; - }], have_arm_simd=yes) - CFLAGS=$xserver_save_CFLAGS) +xserver_save_CFLAGS=$CFLAGS +CFLAGS="-x assembler-with-cpp $CFLAGS" +AC_COMPILE_IFELSE([[ +.text +.arch armv6 +.object_arch armv4 +.arm +.altmacro +#ifndef __ARM_EABI__ +#error EABI is required (to be sure that calling conventions are compatible) +#endif +pld [r0] +uqadd8 r0, r0, r0]], have_arm_simd=yes) +CFLAGS=$xserver_save_CFLAGS AC_ARG_ENABLE(arm-simd, [AC_HELP_STRING([--disable-arm-simd], @@ -396,20 +390,16 @@ if test $enable_arm_simd = no ; then fi if test $have_arm_simd = yes ; then - AC_DEFINE(USE_ARM_SIMD, 1, [use ARM SIMD compiler intrinsics]) -else - ARM_SIMD_CFLAGS= + AC_DEFINE(USE_ARM_SIMD, 1, [use ARM SIMD assembly optimizations]) fi +AM_CONDITIONAL(USE_ARM_SIMD, test $have_arm_simd = yes) + AC_MSG_RESULT($have_arm_simd) if test $enable_arm_simd = yes && test $have_arm_simd = no ; then AC_MSG_ERROR([ARM SIMD intrinsics not detected]) fi -AC_SUBST(ARM_SIMD_CFLAGS) - -AM_CONDITIONAL(USE_ARM_SIMD, test $have_arm_simd = yes) - dnl ========================================================================== dnl Check if assembler is gas compatible and supports NEON instructions have_arm_neon=no diff --git a/pixman/Makefile.am b/pixman/Makefile.am index 5a0e7a9..66ad7f0 100644 --- a/pixman/Makefile.am +++ b/pixman/Makefile.am @@ -97,12 +97,14 @@ endif if USE_ARM_SIMD noinst_LTLIBRARIES += libpixman-arm-simd.la libpixman_arm_simd_la_SOURCES = \ - pixman-arm-simd.c -libpixman_arm_simd_la_CFLAGS = $(DEP_CFLAGS) $(ARM_SIMD_CFLAGS) + pixman-arm-simd.c \ + pixman-arm-common.h \ + pixman-arm-simd-asm.S +libpixman_arm_simd_la_CFLAGS = $(DEP_CFLAGS) libpixman_arm_simd_la_LIBADD = $(DEP_LIBS) libpixman_1_la_LIBADD += libpixman-arm-simd.la -ASM_CFLAGS_arm_simd=$(ARM_SIMD_CFLAGS) +ASM_CFLAGS_arm_simd= endif # arm neon code @@ -110,6 +112,7 @@ if USE_ARM_NEON noinst_LTLIBRARIES += libpixman-arm-neon.la libpixman_arm_neon_la_SOURCES = \ pixman-arm-neon.c \ + pixman-arm-common.h \ pixman-arm-neon-asm.S \ pixman-arm-neon-asm.h libpixman_arm_neon_la_CFLAGS = $(DEP_CFLAGS) diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S new file mode 100644 index 0000000..1a1a0d6 --- /dev/null +++ b/pixman/pixman-arm-simd-asm.S @@ -0,0 +1,330 @@ +/* + * Copyright © 2008 Mozilla Corporation + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of Mozilla Corporation not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission. Mozilla Corporation makes no + * representations about the suitability of this software for any purpose. It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + * Author: Jeff Muizelaar (jeff@infidigm.net) + * + */ + +/* Prevent the stack from becoming executable */ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + + .text + .arch armv6 + .object_arch armv4 + .arm + .altmacro + +/* Supplementary macro for setting function attributes */ +.macro pixman_asm_function fname + .func fname + .global fname +#ifdef __ELF__ + .hidden fname + .type fname, %function +#endif +fname: +.endm + +/* + * The code below was generated by gcc 4.3.4 from the commented out + * functions in 'pixman-arm-simd.c' file with the following optimization + * options: "-O3 -mcpu=arm1136jf-s -fomit-frame-pointer" + * + * TODO: replace gcc generated code with hand tuned versions because + * the code quality is not very good, introduce symbolic register + * aliases for better readability and maintainability. + */ + +pixman_asm_function pixman_composite_add_8000_8000_asm_armv6 + push {r4, r5, r6, r7, r8, r9, r10, r11} + mov r10, r1 + sub sp, sp, #4 + subs r10, r10, #1 + mov r11, r0 + mov r8, r2 + str r3, [sp] + ldr r7, [sp, #36] + bcc 0f +6: cmp r11, #0 + beq 1f + orr r3, r8, r7 + tst r3, #3 + beq 2f + mov r1, r8 + mov r0, r7 + mov r12, r11 + b 3f +5: tst r3, #3 + beq 4f +3: ldrb r2, [r0], #1 + subs r12, r12, #1 + ldrb r3, [r1] + uqadd8 r3, r2, r3 + strb r3, [r1], #1 + orr r3, r1, r0 + bne 5b +1: ldr r3, [sp] + add r8, r8, r3 + ldr r3, [sp, #40] + add r7, r7, r3 +10: subs r10, r10, #1 + bcs 6b +0: add sp, sp, #4 + pop {r4, r5, r6, r7, r8, r9, r10, r11} + bx lr +2: mov r12, r11 + mov r1, r8 + mov r0, r7 +4: cmp r12, #3 + subgt r6, r12, #4 + movgt r9, r12 + lsrgt r5, r6, #2 + addgt r3, r5, #1 + movgt r12, #0 + lslgt r4, r3, #2 + ble 7f +8: ldr r3, [r0, r12] + ldr r2, [r1, r12] + uqadd8 r3, r3, r2 + str r3, [r1, r12] + add r12, r12, #4 + cmp r12, r4 + bne 8b + sub r3, r9, #4 + bic r3, r3, #3 + add r3, r3, #4 + subs r12, r6, r5, lsl #2 + add r1, r1, r3 + add r0, r0, r3 + beq 1b +7: mov r4, #0 +9: ldrb r3, [r1, r4] + ldrb r2, [r0, r4] + uqadd8 r3, r2, r3 + strb r3, [r1, r4] + add r4, r4, #1 + cmp r4, r12 + bne 9b + ldr r3, [sp] + add r8, r8, r3 + ldr r3, [sp, #40] + add r7, r7, r3 + b 10b +.endfunc + +pixman_asm_function pixman_composite_over_8888_8888_asm_armv6 + push {r4, r5, r6, r7, r8, r9, r10, r11} + sub sp, sp, #20 + cmp r1, #0 + mov r12, r2 + str r1, [sp, #12] + str r0, [sp, #16] + ldr r2, [sp, #52] + beq 0f + lsl r3, r3, #2 + str r3, [sp] + ldr r3, [sp, #56] + mov r10, #0 + lsl r3, r3, #2 + str r3, [sp, #8] + mov r11, r3 + b 1f +6: ldr r11, [sp, #8] +1: ldr r9, [sp] + mov r0, r12 + add r12, r12, r9 + mov r1, r2 + str r12, [sp, #4] + add r2, r2, r11 + ldr r12, [sp, #16] + ldr r3, =0x00800080 + ldr r9, =0xff00ff00 + mov r11, #255 + cmp r12, #0 + beq 4f +5: ldr r5, [r1], #4 + ldr r4, [r0] + sub r8, r11, r5, lsr #24 + uxtb16 r6, r4 + uxtb16 r7, r4, ror #8 + mla r6, r6, r8, r3 + mla r7, r7, r8, r3 + uxtab16 r6, r6, r6, ror #8 + uxtab16 r7, r7, r7, ror #8 + and r7, r7, r9 + uxtab16 r6, r7, r6, ror #8 + uqadd8 r5, r6, r5 + str r5, [r0], #4 + subs r12, r12, #1 + bne 5b +4: ldr r3, [sp, #12] + add r10, r10, #1 + cmp r10, r3 + ldr r12, [sp, #4] + bne 6b +0: add sp, sp, #20 + pop {r4, r5, r6, r7, r8, r9, r10, r11} + bx lr +.endfunc + +pixman_asm_function pixman_composite_over_8888_n_8888_asm_armv6 + push {r4, r5, r6, r7, r8, r9, r10, r11} + sub sp, sp, #28 + cmp r1, #0 + str r1, [sp, #12] + ldrb r1, [sp, #71] + mov r12, r2 + str r0, [sp, #16] + ldr r2, [sp, #60] + str r1, [sp, #24] + beq 0f + lsl r3, r3, #2 + str r3, [sp, #20] + ldr r3, [sp, #64] + mov r10, #0 + lsl r3, r3, #2 + str r3, [sp, #8] + mov r11, r3 + b 1f +5: ldr r11, [sp, #8] +1: ldr r4, [sp, #20] + mov r0, r12 + mov r1, r2 + add r12, r12, r4 + add r2, r2, r11 + str r12, [sp] + str r2, [sp, #4] + ldr r12, [sp, #16] + ldr r2, =0x00800080 + ldr r3, [sp, #24] + mov r11, #255 + cmp r12, #0 + beq 3f +4: ldr r5, [r1], #4 + ldr r4, [r0] + uxtb16 r6, r5 + uxtb16 r7, r5, ror #8 + mla r6, r6, r3, r2 + mla r7, r7, r3, r2 + uxtab16 r6, r6, r6, ror #8 + uxtab16 r7, r7, r7, ror #8 + uxtb16 r6, r6, ror #8 + uxtb16 r7, r7, ror #8 + orr r5, r6, r7, lsl #8 + uxtb16 r6, r4 + uxtb16 r7, r4, ror #8 + sub r8, r11, r5, lsr #24 + mla r6, r6, r8, r2 + mla r7, r7, r8, r2 + uxtab16 r6, r6, r6, ror #8 + uxtab16 r7, r7, r7, ror #8 + uxtb16 r6, r6, ror #8 + uxtb16 r7, r7, ror #8 + orr r6, r6, r7, lsl #8 + uqadd8 r5, r6, r5 + str r5, [r0], #4 + subs r12, r12, #1 + bne 4b +3: ldr r1, [sp, #12] + add r10, r10, #1 + cmp r10, r1 + ldr r12, [sp] + ldr r2, [sp, #4] + bne 5b +0: add sp, sp, #28 + pop {r4, r5, r6, r7, r8, r9, r10, r11} + bx lr +.endfunc + +pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6 + push {r4, r5, r6, r7, r8, r9, r10, r11} + sub sp, sp, #28 + cmp r1, #0 + ldr r9, [sp, #60] + str r1, [sp, #12] + bic r1, r9, #-16777216 + str r1, [sp, #20] + mov r12, r2 + lsr r1, r9, #8 + ldr r2, [sp, #20] + bic r1, r1, #-16777216 + bic r2, r2, #65280 + bic r1, r1, #65280 + str r2, [sp, #20] + str r0, [sp, #16] + str r1, [sp, #4] + ldr r2, [sp, #68] + beq 0f + lsl r3, r3, #2 + str r3, [sp, #24] + mov r0, #0 + b 1f +5: ldr r3, [sp, #24] +1: ldr r4, [sp, #72] + mov r10, r12 + mov r1, r2 + add r12, r12, r3 + add r2, r2, r4 + str r12, [sp, #8] + str r2, [sp] + ldr r12, [sp, #16] + ldr r11, =0x00800080 + ldr r2, [sp, #4] + ldr r3, [sp, #20] + cmp r12, #0 + beq 3f +4: ldrb r5, [r1], #1 + ldr r4, [r10] + mla r6, r3, r5, r11 + mla r7, r2, r5, r11 + uxtab16 r6, r6, r6, ror #8 + uxtab16 r7, r7, r7, ror #8 + uxtb16 r6, r6, ror #8 + uxtb16 r7, r7, ror #8 + orr r5, r6, r7, lsl #8 + uxtb16 r6, r4 + uxtb16 r7, r4, ror #8 + mvn r8, r5 + lsr r8, r8, #24 + mla r6, r6, r8, r11 + mla r7, r7, r8, r11 + uxtab16 r6, r6, r6, ror #8 + uxtab16 r7, r7, r7, ror #8 + uxtb16 r6, r6, ror #8 + uxtb16 r7, r7, ror #8 + orr r6, r6, r7, lsl #8 + uqadd8 r5, r6, r5 + str r5, [r10], #4 + subs r12, r12, #1 + bne 4b +3: ldr r4, [sp, #12] + add r0, r0, #1 + cmp r0, r4 + ldr r12, [sp, #8] + ldr r2, [sp] + bne 5b +0: add sp, sp, #28 + pop {r4, r5, r6, r7, r8, r9, r10, r11} + bx lr +.endfunc diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c index f110753..389c9e0 100644 --- a/pixman/pixman-arm-simd.c +++ b/pixman/pixman-arm-simd.c @@ -30,6 +30,8 @@ #include "pixman-private.h" #include "pixman-arm-common.h" +#if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */ + void pixman_composite_add_8000_8000_asm_armv6 (int32_t width, int32_t height, @@ -371,6 +373,8 @@ pixman_composite_over_n_8_8888_asm_armv6 (int32_t width, } } +#endif + PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8000_8000, uint8_t, 1, uint8_t, 1) PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888, -- 2.7.4