powerpc: Optimized strncmp for POWER8/PPC64
author     Adhemerval Zanella <azanella@linux.vnet.ibm.com>
           Fri, 9 Jan 2015 21:04:26 +0000 (16:04 -0500)
committer  Adhemerval Zanella <azanella@linux.vnet.ibm.com>
           Tue, 13 Jan 2015 19:35:40 +0000 (14:35 -0500)
This patch adds an optimized POWER8 strncmp.  The implementation focuses
on speeding up the unaligned cases, following the ideas of the POWER8
strcmp.

The algorithm first checks the initial 16 bytes, then aligns the first
source argument and uses unaligned loads on the second argument only.
Additional checks for page boundaries are done in the unaligned cases
(where the source alignments differ).
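
Roughly, the strategy corresponds to the C model below.  This is a
sketch only: strncmp_model, has_zero_byte, and PAGE_SIZE are
illustrative names, the assembly uses cmpb/orc instead of these
portable bit tricks, and, unlike the assembly, the model stays with
byte-wise compares once a page crossing has been handled.

  #include <stddef.h>
  #include <stdint.h>
  #include <string.h>

  #define PAGE_SIZE 4096

  /* Nonzero iff the doubleword W contains a zero byte.  */
  static int
  has_zero_byte (uint64_t w)
  {
    return ((w - 0x0101010101010101ULL) & ~w
            & 0x8080808080808080ULL) != 0;
  }

  int
  strncmp_model (const char *s1, const char *s2, size_t n)
  {
    /* Compare a doubleword at a time while neither 8-byte read can
       cross a 4K page boundary.  */
    while (n >= 8
           && (uintptr_t) s1 % PAGE_SIZE <= PAGE_SIZE - 8
           && (uintptr_t) s2 % PAGE_SIZE <= PAGE_SIZE - 8)
      {
        uint64_t w1, w2;
        memcpy (&w1, s1, 8);      /* stands in for the unaligned ld */
        memcpy (&w2, s2, 8);
        if (w1 != w2 || has_zero_byte (w1))
          break;                  /* mismatch or terminator here */
        s1 += 8;
        s2 += 8;
        n -= 8;
      }

    /* Finish (and handle any page crossing) one byte at a time.  */
    for (; n > 0; n--, s1++, s2++)
      {
        unsigned char c1 = *s1, c2 = *s2;
        if (c1 != c2 || c1 == '\0')
          return c1 - c2;
      }
    return 0;
  }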

ChangeLog
NEWS
sysdeps/powerpc/powerpc64/multiarch/Makefile
sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
sysdeps/powerpc/powerpc64/multiarch/strncmp-power8.S [new file with mode: 0644]
sysdeps/powerpc/powerpc64/multiarch/strncmp.c
sysdeps/powerpc/powerpc64/power8/strncmp.S [new file with mode: 0644]

diff --git a/ChangeLog b/ChangeLog
index 79e971e..c1e8055 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,14 @@
+2015-01-13  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
+       * sysdeps/powerpc/powerpc64/multiarch/strncmp-power8.S: New file.
+       * sysdeps/powerpc/powerpc64/power8/strncmp.S: New file.
+       * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add
+       strncmp-power8 object.
+       * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+       (__libc_ifunc_impl_list): Add __strncmp_power8 implementation.
+       * sysdeps/powerpc/powerpc64/multiarch/strncmp.c (strncmp): Likewise.
+       * NEWS: Update.
+
 2015-01-13  Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>
            Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
diff --git a/NEWS b/NEWS
index e9f5034..3bdc96a 100644
--- a/NEWS
+++ b/NEWS
@@ -19,8 +19,8 @@ Version 2.21
   17744, 17745, 17746, 17747, 17748, 17775, 17777, 17780, 17781, 17782,
   17791, 17793, 17796, 17797, 17803, 17806, 17834
 
-* Optimized strcpy, stpcpy, strncpy, stpncpy, and strcmp implementations for
-  powerpc64/powerpc64le.
+* Optimized strcpy, stpcpy, strncpy, stpncpy, strcmp, and strncmp
+  implementations for powerpc64/powerpc64le.
   Implemented by Adhemerval Zanella (IBM).
 
 * Added support for TSX lock elision of pthread mutexes on powerpc32, powerpc64
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index ec4fca7..b7ea284 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -7,8 +7,9 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
                   memrchr-power7 memrchr-ppc64 rawmemchr-power7 \
                   rawmemchr-ppc64 strlen-power7 strlen-ppc64 strnlen-power7 \
                   strnlen-ppc64 strcasecmp-power7 strcasecmp_l-power7 \
-                  strncase-power7 strncase_l-power7 strncmp-power7 \
-                  strncmp-power4 strncmp-ppc64 strchr-power7 strchr-ppc64 \
+                  strncase-power7 strncase_l-power7 \
+                  strncmp-power8 strncmp-power7 strncmp-power4 strncmp-ppc64 \
+                  strchr-power7 strchr-ppc64 \
                   strchrnul-power7 strchrnul-ppc64 wcschr-power7 \
                   wcschr-power6 wcschr-ppc64 wcsrchr-power7 wcsrchr-power6 \
                   wcsrchr-ppc64 wcscpy-power7 wcscpy-power6 wcscpy-ppc64 \
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 2c03060..f5fdea5 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -108,6 +108,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/strncmp.c.  */
   IFUNC_IMPL (i, name, strncmp,
+             IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+                             __strncmp_power8)
              IFUNC_IMPL_ADD (array, i, strncmp, hwcap & PPC_FEATURE_HAS_VSX,
                              __strncmp_power7)
              IFUNC_IMPL_ADD (array, i, strncmp, hwcap & PPC_FEATURE_POWER4,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strncmp-power8.S
new file mode 100644
index 0000000..8d7223d
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncmp-power8.S
@@ -0,0 +1,40 @@
+/* Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#undef EALIGN
+#define EALIGN(name,alignt,words)                              \
+  .section ".text";                                            \
+  ENTRY_2(__strncmp_power8)                                    \
+  .align ALIGNARG(alignt);                                     \
+  EALIGN_W_##words;                                            \
+  BODY_LABEL(__strncmp_power8):                                        \
+  cfi_startproc;                                               \
+  LOCALENTRY(__strncmp_power8)
+
+#undef END
+#define END(name)                                              \
+  cfi_endproc;                                                 \
+  TRACEBACK(__strncmp_power8)                                  \
+  END_2(__strncmp_power8)
+
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/strncmp.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
index eb02aac..9b6a659 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
 extern __typeof (strncmp) __strncmp_ppc attribute_hidden;
 extern __typeof (strncmp) __strncmp_power4 attribute_hidden;
 extern __typeof (strncmp) __strncmp_power7 attribute_hidden;
+extern __typeof (strncmp) __strncmp_power8 attribute_hidden;
 
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
    ifunc symbol properly.  */
 libc_ifunc (strncmp,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strncmp_power7 :
-             (hwcap & PPC_FEATURE_POWER4)
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+            ? __strncmp_power8 :
+              (hwcap & PPC_FEATURE_HAS_VSX)
+              ? __strncmp_power7 :
+               (hwcap & PPC_FEATURE_POWER4)
                ? __strncmp_power4
             : __strncmp_ppc);
 #endif
diff --git a/sysdeps/powerpc/powerpc64/power8/strncmp.S b/sysdeps/powerpc/powerpc64/power8/strncmp.S
new file mode 100644
index 0000000..56c814b
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strncmp.S
@@ -0,0 +1,323 @@
+/* Optimized strncmp implementation for PowerPC64/POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Implements the function
+
+   int [r3] strncmp (const char *s1 [r3], const char *s2 [r4], size_t n [r5])
+
+   The implementation uses unaligned doubleword accesses to avoid
+   specialized code paths that depend on data alignment.  Although recent
+   powerpc64 systems use 64K pages by default, the page-cross handling
+   assumes a minimum page size of 4K.  */
+
+       .machine  power7
+EALIGN (strncmp, 4, 0)
+       /* Check if size is 0.  */
+       mr.     r10,r5
+       beq     cr0,L(ret0)
+
+       /* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
+          the code:
+
+           (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
+
+          with PAGE_SIZE being 4096 and ITER_SIZE being 16.  */
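+       /* rldicl with a shift of 0 and a mask beginning at bit 52 keeps
+          the low 12 bits of the address, i.e. its offset within a 4K
+          page.  */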
+       rldicl  r8,r3,0,52
+       cmpldi  cr7,r8,4096-16
+       bgt     cr7,L(pagecross)
+       rldicl  r9,r4,0,52
+       cmpldi  cr7,r9,4096-16
+       bgt     cr7,L(pagecross)
+
+       /* For short strings of up to 16 bytes, load both s1 and s2 with
+          unaligned doubleword loads and compare.  */
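+       /* cmpb sets each result byte to 0xff where the corresponding
+          operand bytes are equal and to 0x00 otherwise: r8 marks the
+          null bytes of s1, r6 marks the bytes where s1 and s2 match, so
+          r8 | ~r6 (orc.) is nonzero iff this doubleword contains a
+          terminator or a mismatch.  */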
+       ld      r7,0(r3)
+       ld      r9,0(r4)
+       li      r8,0
+       cmpb    r8,r7,r8
+       cmpb    r6,r7,r9
+       orc.    r8,r8,r6
+       bne     cr0,L(different1)
+
+       /* If the strings compared are equal but size is less than or
+          equal to 8, return 0.  */
+       cmpldi  cr7,r10,8
+       li      r9,0
+       ble     cr7,L(ret1)
+       addi    r5,r10,-8
+
+       ld      r7,8(r3)
+       ld      r9,8(r4)
+       cmpb    r8,r7,r8
+       cmpb    r6,r7,r9
+       orc.    r8,r8,r6
+       bne     cr0,L(different0)
+
+       cmpldi  cr7,r5,8
+       mr      r9,r8
+       ble     cr7,L(ret1)
+
+       /* Update pointers and size.  */
+       addi    r10,r10,-16
+       addi    r3,r3,16
+       addi    r4,r4,16
+
+       /* The first 16 bytes have now been checked; align source1 to a
+          doubleword boundary and adjust the source2 address.  */
+L(align_8b):
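+       /* r5 = s1 % 8; round s1 down to a doubleword boundary and bias
+          s2 and the remaining size by the same amount, so the few bytes
+          already compared are simply re-read from the aligned
+          address.  */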
+       rldicl  r5,r3,0,61
+       rldicr  r3,r3,0,60
+       subf    r4,r5,r4
+       add     r10,r10,r5
+
+       /* At this point source1's alignment is 0 and source2's alignment
+          is between 0 and 7.  Check whether source2's alignment is also
+          0, meaning both sources have the same alignment.  */
+       andi.   r8,r4,0x7
+       beq     cr0,L(loop_eq_align_0)
+
+       li      r5,0
+       b       L(loop_ne_align_1)
+
+       /* If source2 is not doubleword aligned, the code needs to check
+          on each iteration whether the unaligned doubleword access will
+          cross a 4K page boundary: an 8-byte load at source2 is safe
+          only while source2 % 4096 <= 4088, which is what the cmpldi
+          against 4088 below tests.  */
+       .align 4
+L(loop_ne_align_0):
+       ld      r7,0(r3)
+       ld      r9,0(r4)
+       cmpb    r8,r7,r5
+       cmpb    r6,r7,r9
+       orc.    r8,r8,r6
+       bne     cr0,L(different1)
+
+       cmpldi  cr7,r10,8
+       ble     cr7,L(ret0)
+       addi    r10,r10,-8
+       addi    r3,r3,8
+       addi    r4,r4,8
+L(loop_ne_align_1):
+       rldicl  r9,r4,0,52
+       cmpldi  r7,r9,4088
+       ble     cr7,L(loop_ne_align_0)
+       cmpdi   cr7,r10,0
+       beq     cr7,L(ret0)
+
+       lbz     r9,0(r3)
+       lbz     r8,0(r4)
+       cmplw   cr7,r9,r8
+       bne     cr7,L(byte_ne_4)
+       cmpdi   cr7,r9,0
+       beq     cr7,L(size_reached_0)
+
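+       /* glibc's powerpc sysdep.h defines register names as bare
+          numbers, so "li r9,r7" assembles as "li r9,7": set up CTR for
+          the (at most) 7 byte-wise compares that finish the current
+          doubleword of source1 and keep it aligned.  */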
+       li      r9,r7
+       addi    r8,r3,1
+       mtctr   r9
+       addi    r4,r4,1
+       addi    r10,r10,-1
+       addi    r3,r3,8
+
+       /* The unaligned read of source2 would cross a 4K page boundary,
+          and the differing byte or null terminator may lie in the
+          remaining page bytes.  Since the unaligned load cannot be used,
+          the algorithm compares up to 8 bytes one at a time, which also
+          keeps source1 doubleword aligned.  */
+       .align 4
+L(loop_ne_align_byte):
+       cmpdi   cr7,r10,0
+       addi    r10,r10,-1
+       beq     cr7,L(ret0)
+       lbz     r9,0(r8)
+       lbz     r7,0(r4)
+       addi    r8,r8,1
+       addi    r4,r4,1
+       cmplw   cr7,r9,r7
+       cmpdi   cr5,r9,0
+       bne     cr7,L(size_reached_2)
+       beq     cr5,L(size_reached_0)
+       bdnz    L(loop_ne_align_byte)
+
+       cmpdi   cr7,r10,0
+       bne+    cr7,L(loop_ne_align_0)
+
+       .align 4
+L(ret0):
+       li      r9,0
+L(ret1):
+       mr      r3,r9
+       blr
+
+       /* A mismatch or null terminator was found: the cmpb mask in r8
+          flags its byte within the doublewords loaded into r7 and r9.
+          Locate that byte, clamp its position to the remaining length,
+          and return the byte difference.  In C (z1 = the mask, r1/r2 =
+          the loaded words, n = the remaining length in bytes):
+
+       #ifdef __LITTLE_ENDIAN__
+         leadzero = (__builtin_ffsl (z1) - 1);
+         leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
+         r1 = (r1 >> leadzero) & 0xFFUL;
+         r2 = (r2 >> leadzero) & 0xFFUL;
+       #else
+         leadzero = __builtin_clzl (z1);
+         leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
+         r1 = (r1 >> (56 - leadzero)) & 0xFFUL;
+         r2 = (r2 >> (56 - leadzero)) & 0xFFUL;
+       #endif
+         return r1 - r2;  */
+
+       .align 4
+L(different0):
+       mr      r10,r5
+#ifdef __LITTLE_ENDIAN__
+L(different1):
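+        /* On little endian the first differing byte is the least
+           significant nonzero byte of the mask, so r8 & -r8 isolates its
+           bit before counting.  */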
+        neg    r11,r8
+        sldi   r10,r10,3
+        and    r8,r11,r8
+        addi   r10,r10,-8
+        cntlzd r8,r8
+        subfic r8,r8,63
+        extsw  r8,r8
+        cmpld  cr7,r8,r10
+        ble    cr7,L(different2)
+        mr     r8,r10
+L(different2):
+        extsw  r8,r8
+#else
+L(different1):
+       addi    r10,r10,-1
+       cntlzd  r8,r8
+       sldi    r10,r10,3
+       cmpld   cr7,r8,r10
+       blt     cr7,L(different2)
+       mr      r8,r10
+L(different2):
+       subfic  r8,r8,56
+#endif
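+       /* Shift the first differing byte of each word into the low byte
+          and return the difference; "subf r9,r9,3" is r9 = r3 - r9, the
+          bare 3 naming GPR 3 as above.  */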
+       srd     r7,r7,r8
+       srd     r9,r9,r8
+       rldicl  r3,r7,0,56
+       rldicl  r9,r9,0,56
+       subf    r9,r9,3
+       extsw   r9,r9
+       mr      r3,r9
+       blr
+
+       /* If the unaligned 16-byte read would cross a 4K page boundary,
+          fall back to a simple byte-by-byte comparison until s1 reaches
+          the page boundary.  */
+       .align 4
+L(pagecross):
+       lbz     r7,0(r3)
+       lbz     r9,0(r4)
+       subfic  r8,r8,4095
+       cmplw   cr7,r9,r7
+       bne     cr7,L(byte_ne_3)
+       cmpdi   cr7,r9,0
+       beq     cr7,L(byte_ne_0)
+       addi    r10,r10,-1
+       subf    r7,r8,r10
+       subf    r9,r7,r10
+       addi    r9,r9,1
+       mtctr   r9
+       b       L(pagecross_loop1)
+
+       .align 4
+L(pagecross_loop0):
+       beq     cr7,L(ret0)
+       lbz     r9,0(r3)
+       lbz     r8,0(r4)
+       addi    r10,r10,-1
+       cmplw   cr7,r9,r8
+       cmpdi   cr5,r9,0
+       bne     r7,L(byte_ne_2)
+       beq     r5,L(byte_ne_0)
+L(pagecross_loop1):
+       cmpdi   cr7,r10,0
+       addi    r3,r3,1
+       addi    r4,r4,1
+       bdnz    L(pagecross_loop0)
+       cmpdi   cr7,r7,0
+       li      r9,0
+       bne+    cr7,L(align_8b)
+       b       L(ret1)
+
+       /* If both source1 and source2 are doubleword aligned, there is no
+          need for page-boundary checks.  */
+       .align 4
+L(loop_eq_align_0):
+       ld      r7,0(r3)
+       ld      r9,0(r4)
+       cmpb    r8,r7,r8
+       cmpb    r6,r7,r9
+       orc.    r8,r8,r6
+       bne     cr0,L(different1)
+
+       cmpldi  cr7,r10,8
+       ble     cr7,L(ret0)
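+       /* Set CTR to the number of doubleword iterations left:
+          (n - 9) / 8 + 1 == ceil ((n - 8) / 8) for the n > 8 case.  */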
+       addi    r9,r10,-9
+
+       li      r5,0
+       srdi    r9,r9,3
+       addi    r9,r9,1
+       mtctr   r9
+       b       L(loop_eq_align_2)
+
+       .align 4
+L(loop_eq_align_1):
+       bdz     L(ret0)
+L(loop_eq_align_2):
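+       /* ldu is a load with update: r3/r4 are advanced by 8 before the
+          load, so they always point at the doubleword being
+          compared.  */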
+       ldu     r7,8(r3)
+       addi    r10,r10,-8
+       ldu     r9,8(r4)
+       cmpb    r8,r7,r5
+       cmpb    r6,r7,r9
+       orc.    r8,r8,r6
+       beq     cr0,L(loop_eq_align_1)
+       b       L(different1)
+
+       .align 4
+L(byte_ne_0):
+       li      r7,0
+L(byte_ne_1):
+       subf    r9,r9,r7
+       extsw   r9,r9
+       b       L(ret1)
+
+       .align 4
+L(byte_ne_2):
+       extsw   r7,r9
+       mr      r9,r8
+       b       L(byte_ne_1)
+L(size_reached_0):
+       li      r10,0
+L(size_reached_1):
+       subf    r9,r9,r10
+       extsw   r9,r9
+       b       L(ret1)
+L(size_reached_2):
+       extsw   r10,r9
+       mr      r9,r7
+       b       L(size_reached_1)
+L(byte_ne_3):
+       extsw   r7,r7
+       b       L(byte_ne_1)
+L(byte_ne_4):
+       extsw   r10,r9
+       mr      r9,r8
+       b       L(size_reached_1)
+END(strncmp)
+libc_hidden_builtin_def(strncmp)