sysdeps/arm/armv6t2/strlen.S: strlen implementation for armv6t2.
authorWill Newton <will.newton@linaro.org>
Fri, 9 Aug 2013 08:36:41 +0000 (09:36 +0100)
committerWill Newton <will.newton@linaro.org>
Fri, 30 Aug 2013 09:04:57 +0000 (10:04 +0100)
This implementation of strlen is faster than the armv6 version for
all string lengths greater than 1 on a Cortex-A15.

ports/ChangeLog.arm:

2013-08-09  Will Newton  <will.newton@linaro.org>

* sysdeps/arm/armv6t2/strlen.S: New file.

ports/ChangeLog.arm
ports/sysdeps/arm/armv6t2/strlen.S [new file with mode: 0644]

index 6dad08d..8236f0f 100644 (file)
@@ -1,3 +1,7 @@
+2013-08-30  Will Newton  <will.newton@linaro.org>
+
+       * sysdeps/arm/armv6t2/strlen.S: New file.
+
 2013-08-29  Thomas Schwinge  <thomas@codesourcery.com>
 
        * sysdeps/unix/sysv/linux/arm/ldsodefs.h (VALID_ELF_OSABI)
diff --git a/ports/sysdeps/arm/armv6t2/strlen.S b/ports/sysdeps/arm/armv6t2/strlen.S
new file mode 100644 (file)
index 0000000..a52e2e7
--- /dev/null
@@ -0,0 +1,141 @@
+/* Copyright (C) 2010-2011,2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/*
+   Assumes:
+   ARMv6T2, AArch32
+
+ */
+
+#include <sysdep.h>
+
+#ifdef __ARMEB__
+#define S2LO           lsl
+#define S2HI           lsr
+#else
+#define S2LO           lsr
+#define S2HI           lsl
+#endif
+
+       /* This code requires Thumb.  */
+       .thumb
+       .syntax unified
+
+/* Parameters and result.  */
+#define srcin          r0
+#define result         r0
+
+/* Internal variables.  */
+#define src            r1
+#define data1a         r2
+#define data1b         r3
+#define const_m1       r12
+#define const_0                r4
+#define tmp1           r4              /* Overlaps const_0  */
+#define tmp2           r5
+
+       .text
+       .p2align 6
+ENTRY(strlen)
+       pld     [srcin, #0]
+       strd    r4, r5, [sp, #-8]!
+       cfi_adjust_cfa_offset (8)
+       cfi_rel_offset (r4, 0)
+       cfi_rel_offset (r5, 4)
+       cfi_remember_state
+       bic     src, srcin, #7
+       mvn     const_m1, #0
+       ands    tmp1, srcin, #7         /* (8 - bytes) to alignment.  */
+       pld     [src, #32]
+       bne.w   .Lmisaligned8
+       mov     const_0, #0
+       mov     result, #-8
+.Lloop_aligned:
+       /* Bytes 0-7.  */
+       ldrd    data1a, data1b, [src]
+       pld     [src, #64]
+       add     result, result, #8
+.Lstart_realigned:
+       uadd8   data1a, data1a, const_m1        /* Saturating GE<0:3> set.  */
+       sel     data1a, const_0, const_m1       /* Select based on GE<0:3>.  */
+       uadd8   data1b, data1b, const_m1
+       sel     data1b, data1a, const_m1        /* Only used if d1a == 0.  */
+       cbnz    data1b, .Lnull_found
+
+       /* Bytes 8-15.  */
+       ldrd    data1a, data1b, [src, #8]
+       uadd8   data1a, data1a, const_m1        /* Saturating GE<0:3> set.  */
+       add     result, result, #8
+       sel     data1a, const_0, const_m1       /* Select based on GE<0:3>.  */
+       uadd8   data1b, data1b, const_m1
+       sel     data1b, data1a, const_m1        /* Only used if d1a == 0.  */
+       cbnz    data1b, .Lnull_found
+
+       /* Bytes 16-23.  */
+       ldrd    data1a, data1b, [src, #16]
+       uadd8   data1a, data1a, const_m1        /* Saturating GE<0:3> set.  */
+       add     result, result, #8
+       sel     data1a, const_0, const_m1       /* Select based on GE<0:3>.  */
+       uadd8   data1b, data1b, const_m1
+       sel     data1b, data1a, const_m1        /* Only used if d1a == 0.  */
+       cbnz    data1b, .Lnull_found
+
+       /* Bytes 24-31.  */
+       ldrd    data1a, data1b, [src, #24]
+       add     src, src, #32
+       uadd8   data1a, data1a, const_m1        /* Saturating GE<0:3> set.  */
+       add     result, result, #8
+       sel     data1a, const_0, const_m1       /* Select based on GE<0:3>.  */
+       uadd8   data1b, data1b, const_m1
+       sel     data1b, data1a, const_m1        /* Only used if d1a == 0.  */
+       cmp     data1b, #0
+       beq     .Lloop_aligned
+
+.Lnull_found:
+       cmp     data1a, #0
+       itt     eq
+       addeq   result, result, #4
+       moveq   data1a, data1b
+#ifndef __ARMEB__
+       rev     data1a, data1a
+#endif
+       clz     data1a, data1a
+       ldrd    r4, r5, [sp], #8
+       cfi_adjust_cfa_offset (-8)
+       cfi_restore (r4)
+       cfi_restore (r5)
+       add     result, result, data1a, lsr #3  /* Bits -> Bytes.  */
+       DO_RET(lr)
+
+.Lmisaligned8:
+       cfi_restore_state
+       ldrd    data1a, data1b, [src]
+       and     tmp2, tmp1, #3
+       rsb     result, tmp1, #0
+       lsl     tmp2, tmp2, #3                  /* Bytes -> bits.  */
+       tst     tmp1, #4
+       pld     [src, #64]
+       S2HI    tmp2, const_m1, tmp2
+       orn     data1a, data1a, tmp2
+       itt     ne
+       ornne   data1b, data1b, tmp2
+       movne   data1a, const_m1
+       mov     const_0, #0
+       b       .Lstart_realigned
+
+END(strlen)
+libc_hidden_builtin_def (strlen)