powerpc: Optimized st{r,p}ncpy for POWER8/PPC64
authorAdhemerval Zanella <azanella@linux.vnet.ibm.com>
Wed, 31 Dec 2014 16:47:41 +0000 (11:47 -0500)
committerAdhemerval Zanella <azanella@linux.vnet.ibm.com>
Tue, 13 Jan 2015 16:28:44 +0000 (11:28 -0500)
This patch adds an optimized POWER8 st{r,p}ncpy using unaligned accesses.
It shows 10%-80% improvement over the optimized POWER7 one that uses
only aligned accesses, especially on unaligned inputs.

The algorithm first reads and checks 16 bytes (if the inputs do not cross
a 4K page boundary).  Then it realigns the source to 16 bytes and issues a
16-byte read and compare loop to speed up null byte checks for large
strings.  Also, different from the POWER7 optimization, the null padding is
done inline in the implementation using possibly unaligned accesses,
instead of relying on a memset call.  A special case is added for page
cross reads.

ChangeLog
NEWS
sysdeps/powerpc/powerpc64/multiarch/Makefile
sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S [new file with mode: 0644]
sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S [new file with mode: 0644]
sysdeps/powerpc/powerpc64/multiarch/strncpy.c
sysdeps/powerpc/powerpc64/power8/stpncpy.S [new file with mode: 0644]
sysdeps/powerpc/powerpc64/power8/strncpy.S [new file with mode: 0644]

index 16199e3..20aded4 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,20 @@
 2015-01-13  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
+       * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
+       Add strncpy-power8 and stpncpy-power8 objects.
+       * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+       (__libc_ifunc_impl_list): Add __strncpy_power8 and __stpncpy_power8
+       implementations.
+       * sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S: New file.
+       * sysdeps/powerpc/powerpc64/multiarch/stpncpy.c (__stpncpy): Add
+       __stpncpy_power8 implementation.
+       * sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S: New file.
+       * sysdeps/powerpc/powerpc64/multiarch/strncpy.c (strncpy): Add
+       __strncpy_power8 implementation.
+       * sysdeps/powerpc/powerpc64/power8/stpncpy.S: New file.
+       * sysdeps/powerpc/powerpc64/power8/strncpy.S: New file.
+       * NEWS: Update.
+
        * sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c: New file.
        * sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S: Remove file.
        * sysdeps/powerpc/powerpc64/power7/strncat.S: Likewise.
diff --git a/NEWS b/NEWS
index e020918..08b3daa 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -19,7 +19,8 @@ Version 2.21
   17744, 17745, 17746, 17747, 17748, 17775, 17777, 17780, 17781, 17782,
   17791, 17793, 17796, 17797, 17803, 17806, 17834
 
-* Optimized strcpy and stpcpy implementations for powerpc64/powerpc64le.
+* Optimized strcpy, stpcpy, strncpy, stpncpy implementations for
+  powerpc64/powerpc64le.
 
 * Added support for TSX lock elision of pthread mutexes on powerpc32, powerpc64
   and powerpc64le.  This may improve lock scaling of existing programs on
index 74b2daa..18d3378 100644 (file)
@@ -17,9 +17,10 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
                   stpcpy-power7 stpcpy-ppc64 \
                   strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
                   strncpy-power7 strncpy-ppc64 \
-                  stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 \
+                  stpncpy-power8 stpncpy-power7 stpncpy-ppc64 \
+                  strcmp-power7 strcmp-ppc64 \
                   strcat-power8 strcat-power7 strcat-ppc64 memmove-power7 \
-                  memmove-ppc64 bcopy-ppc64
+                  memmove-ppc64 bcopy-ppc64 strncpy-power8
 
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
index dbb21fd..132cb13 100644 (file)
@@ -279,6 +279,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c.  */
   IFUNC_IMPL (i, name, strncpy,
              IFUNC_IMPL_ADD (array, i, strncpy,
+                             hwcap2 & PPC_FEATURE2_ARCH_2_07,
+                             __strncpy_power8)
+             IFUNC_IMPL_ADD (array, i, strncpy,
                              hwcap & PPC_FEATURE_HAS_VSX,
                              __strncpy_power7)
              IFUNC_IMPL_ADD (array, i, strncpy, 1,
@@ -287,6 +290,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c.  */
   IFUNC_IMPL (i, name, stpncpy,
              IFUNC_IMPL_ADD (array, i, stpncpy,
+                             hwcap2 & PPC_FEATURE2_ARCH_2_07,
+                             __stpncpy_power8)
+             IFUNC_IMPL_ADD (array, i, stpncpy,
                              hwcap & PPC_FEATURE_HAS_VSX,
                              __stpncpy_power7)
              IFUNC_IMPL_ADD (array, i, stpncpy, 1,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S
new file mode 100644 (file)
index 0000000..d5d835d
--- /dev/null
@@ -0,0 +1,39 @@
+/* Optimized stpncpy implementation for POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#define USE_AS_STPNCPY
+
+#undef EALIGN  /* Replace the EALIGN entry macro: the `name' argument is
+                  ignored and every symbol is emitted for __stpncpy_power8,
+                  so the implementation included at the end of this file is
+                  assembled under that name.  */
+#define EALIGN(name, alignt, words)                            \
+  .section ".text";                                            \
+  ENTRY_2(__stpncpy_power8)                                    \
+  .align ALIGNARG(alignt);                                     \
+  EALIGN_W_##words;                                            \
+  BODY_LABEL(__stpncpy_power8):                                        \
+  cfi_startproc;                                               \
+  LOCALENTRY(__stpncpy_power8)
+
+#undef END     /* Matching END override: `name' is ignored; emits
+                  cfi_endproc, TRACEBACK and END_2 for __stpncpy_power8.  */
+#define END(name)                                              \
+  cfi_endproc;                                                 \
+  TRACEBACK(__stpncpy_power8)                                  \
+  END_2(__stpncpy_power8)
+
+#include <sysdeps/powerpc/powerpc64/power8/stpncpy.S>
index 9e5a270..0f4072f 100644 (file)
 
 extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
 extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
+extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden;
 
 libc_ifunc (__stpncpy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __stpncpy_power7
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+            ? __stpncpy_power8 :
+              (hwcap & PPC_FEATURE_HAS_VSX)
+              ? __stpncpy_power7
             : __stpncpy_ppc);
 
 weak_alias (__stpncpy, stpncpy)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S
new file mode 100644 (file)
index 0000000..ed906a4
--- /dev/null
@@ -0,0 +1,40 @@
+/* Optimized strncpy implementation for POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#undef EALIGN  /* Replace the EALIGN entry macro: the `name' argument is
+                  ignored and every symbol is emitted for __strncpy_power8,
+                  so the implementation included at the end of this file is
+                  assembled under that name.  */
+#define EALIGN(name, alignt, words)                            \
+  .section ".text";                                            \
+  ENTRY_2(__strncpy_power8)                                    \
+  .align ALIGNARG(alignt);                                     \
+  EALIGN_W_##words;                                            \
+  BODY_LABEL(__strncpy_power8):                                        \
+  cfi_startproc;                                               \
+  LOCALENTRY(__strncpy_power8)
+
+#undef END     /* Matching END override: `name' is ignored; emits
+                  cfi_endproc, TRACEBACK and END_2 for __strncpy_power8.  */
+#define END(name)                                              \
+  cfi_endproc;                                                 \
+  TRACEBACK(__strncpy_power8)                                  \
+  END_2(__strncpy_power8)
+
+#undef libc_hidden_builtin_def /* Make the macro expand to nothing, so the
+                                  libc_hidden_builtin_def (strncpy) at the
+                                  end of the included strncpy.S emits
+                                  nothing in this file.  */
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/strncpy.S>
index ae4e97a..ffb0f23 100644 (file)
 
 extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
 extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
+extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
 
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
  ifunc symbol properly. */
 libc_ifunc (strncpy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strncpy_power7
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+            ? __strncpy_power8 :
+              (hwcap & PPC_FEATURE_HAS_VSX)
+              ? __strncpy_power7
             : __strncpy_ppc);
 
 #endif
diff --git a/sysdeps/powerpc/powerpc64/power8/stpncpy.S b/sysdeps/powerpc/powerpc64/power8/stpncpy.S
new file mode 100644 (file)
index 0000000..76a1466
--- /dev/null
@@ -0,0 +1,20 @@
+/* Optimized stpncpy implementation for PowerPC64/POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define USE_AS_STPNCPY /* Selects the stpncpy variant of the shared
+                          strncpy.S source included below.  */
+#include <sysdeps/powerpc/powerpc64/power8/strncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S
new file mode 100644 (file)
index 0000000..5fda953
--- /dev/null
@@ -0,0 +1,424 @@
+/* Optimized strncpy/stpncpy implementation for PowerPC64/POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifdef USE_AS_STPNCPY  /* The entry symbol depends on the variant built.  */
+# define FUNC_NAME __stpncpy
+#else
+# define FUNC_NAME strncpy
+#endif /* USE_AS_STPNCPY */
+
+/* Implements the function
+
+   char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+   or
+
+   char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+   if USE_AS_STPNCPY is defined.
+
+   The implementation uses unaligned doubleword access to avoid specialized
+   code paths depending on data alignment.  Although recent powerpc64 uses
+   64K pages as default, the page cross handling assumes a minimum page size
+   of 4k.  */
+
+       .machine  power7
+EALIGN (FUNC_NAME, 4, 0)
+
+        /* Check if reading 16 bytes from [src] could cross a 4K page by
+           checking if the bit indicating the page size changes between
+           [src] and [src]+16.  Basically:
+
+           uint64_t srcin = (uint64_t)src;
+           uint64_t ob = srcin & 4096UL;
+           uint64_t nb = (srcin+16UL) & 4096UL;
+           if (ob ^ nb)
+             goto pagecross;
+
+           [src]+16 stays in r10; L(align_to_16b) uses it to realign the
+           source.  */
+
+       addi    r10,r4,16       /* r10 = src + 16.  */
+       rlwinm  r9,r4,0,19,19   /* r9 = src & 4096.  */
+
+       /* Since it is a leaf function, save some non-volatile registers on the
+          protected/red zone.  */
+       std     r26,-48(r1)
+       std     r27,-40(r1)
+
+       rlwinm  r8,r10,0,19,19  /* r8 = (src + 16) & 4096.  */
+
+       std     r28,-32(r1)
+       std     r29,-24(r1)
+
+       cmpld   cr7,r9,r8
+
+       std     r30,-16(r1)
+       std     r31,-8(r1)
+
+       beq     cr7,L(unaligned_lt_16)  /* Same 4K page: fast path.  */
+       rldicl  r9,r4,0,61      /* r9 = src & 7.  */
+       subfic  r8,r9,8         /* r8 = 8 - (src & 7).  */
+       cmpld   cr7,r5,r8
+       bgt     cr7,L(pagecross)        /* n > r8: page-cross handling;
+                                          otherwise fall through to the
+                                          byte-wise short path.  */
+
+       /* At this point there are 1 to 15 bytes to check and write.  Since it
+          could be either from first unaligned 16 bytes access or from bulk
+          copy, the code uses an unrolled byte read/write instead of trying to
+          analyze the cmpb results.
+
+          Register use: r4 = source cursor, r9 = destination cursor,
+          r5 = bytes still to be written.  */
+L(short_path):
+       mr      r9,r3
+L(short_path_1):
+       cmpdi   cr7,r5,0
+       beq     cr7,L(short_path_loop_end_1)    /* Nothing left to copy.  */
+L(short_path_2):
+       lbz     r10,0(r4)
+       cmpdi   cr7,r10,0
+       stb     r10,0(r9)
+       beq     cr7,L(zero_pad_start_1)         /* Copied the null byte.  */
+       cmpdi   cr0,r5,1
+       addi    r8,r9,1
+       addi    r6,r5,-1
+       beq     cr0,L(short_path_loop_end_0)    /* That was the last byte.  */
+       lbz     r10,1(r4)
+       cmpdi   cr7,r10,0
+       stb     r10,1(r9)
+       beq     cr7,L(zero_pad_start_prepare_1)
+       addi    r10,r5,-3
+       b       L(short_path_loop_1)
+
+       .align  4
+L(short_path_loop):
+       lbz     r8,0(r4)
+       addi    r7,r10,-2
+       cmpdi   cr5,r8,0
+       stb     r8,0(r9)
+       beq     cr5,L(zero_pad_start_1)
+       beq     cr7,L(short_path_loop_end_0)    /* cr7: r10 == 0, compared in
+                                                  L(short_path_loop_1).  */
+       lbz     r8,1(r4)
+       cmpdi   cr7,r8,0
+       stb     r8,1(r9)
+       beq     cr7,L(zero_pad_start)
+       mr      r10,r7
+L(short_path_loop_1):
+       addic.  r5,r5,-2        /* Two bytes handled per iteration.  */
+       addi    r9,r9,2
+       cmpdi   cr7,r10,0
+       addi    r4,r4,2
+       addi    r6,r9,1
+       bne     cr0,L(short_path_loop)
+#ifdef USE_AS_STPNCPY
+       mr      r3,r9
+       b       L(short_path_loop_end)
+#endif
+
+L(short_path_loop_end_0):
+#ifdef USE_AS_STPNCPY
+       addi    r3,r9,1
+       b       L(short_path_loop_end)
+#endif
+L(short_path_loop_end_1):
+#ifdef USE_AS_STPNCPY
+       mr      r3,r9
+#endif
+L(short_path_loop_end):
+       /* Restore non-volatile registers.  */
+       ld      r26,-48(r1)
+       ld      r27,-40(r1)
+       ld      r28,-32(r1)
+       ld      r29,-24(r1)
+       ld      r30,-16(r1)
+       ld      r31,-8(r1)
+       blr
+
+       /* This code pads the remainder of dest with null bytes.  The algorithm
+          calculates the remaining size and issues an unrolled doubleword
+          loop followed by word/halfword/byte stores for the tail.
+          On entry to L(zero_pad_start_1): r9 = first byte to clear,
+          r5 = number of bytes to clear.  */
+       .align  4
+L(zero_pad_start):
+       mr      r5,r10
+       mr      r9,r6
+L(zero_pad_start_1):
+       srdi.   r8,r5,3         /* r8 = number of whole doublewords.  */
+       mr      r10,r9
+#ifdef USE_AS_STPNCPY
+       mr      r3,r9           /* stpncpy result: start of the padding.  */
+#endif
+       beq-    cr0,L(zero_pad_loop_b_start)    /* Fewer than 8 bytes.  */
+       cmpldi  cr7,r8,1
+       li      r7,0
+       std     r7,0(r9)
+       beq     cr7,L(zero_pad_loop_b_prepare)  /* Exactly one doubleword.  */
+       addic.  r8,r8,-2
+       addi    r10,r9,16
+       std     r7,8(r9)
+       beq     cr0,L(zero_pad_loop_dw_2)
+       std     r7,16(r9)
+       li      r9,0            /* From here on r9 is the zero source.  */
+       b       L(zero_pad_loop_dw_1)
+
+       .align  4
+L(zero_pad_loop_dw):
+       addi    r10,r10,16
+       std     r9,-8(r10)
+       beq     cr0,L(zero_pad_loop_dw_2)
+       std     r9,0(r10)
+L(zero_pad_loop_dw_1):
+       cmpldi  cr7,r8,1
+       std     r9,0(r10)
+       addic.  r8,r8,-2
+       bne     cr7,L(zero_pad_loop_dw)
+       addi    r10,r10,8
+L(zero_pad_loop_dw_2):
+       rldicl  r5,r5,0,61      /* r5 = tail size (size & 7).  */
+L(zero_pad_loop_b_start):
+       cmpdi   cr7,r5,0
+       addi    r5,r5,-1
+       addi    r9,r10,-1
+       add     r10,r10,r5
+       subf    r10,r9,r10      /* r10 = tail size again.  */
+       li      r8,0
+       beq-    cr7,L(short_path_loop_end)
+
+       /* Write the remaining 1-7 bytes.  The low bits of the count in r10
+          are copied to cr7 and select the word (bit 29), halfword (bit 30)
+          and byte (bit 31) stores.  */
+        .align  4
+       addi    r9,r9,1
+       mtocrf  0x1,r10
+       bf      29,4f
+        stw     r8,0(r9)
+        addi   r9,r9,4
+
+        .align  4
+4:      bf      30,2f
+        sth     r8,0(r9)
+        addi   r9,r9,2
+
+        .align  4
+2:      bf     31,1f
+        stb    r8,0(r9)
+
+       /* Restore non-volatile registers.  */
+1:     ld      r26,-48(r1)
+       ld      r27,-40(r1)
+       ld      r28,-32(r1)
+       ld      r29,-24(r1)
+       ld      r30,-16(r1)
+       ld      r31,-8(r1)
+       blr
+
+       /* The common case where [src]+16 will not cross a 4K page boundary.
+          In this case the code fast checks the first 16 bytes by using
+          doubleword read/compares and updates the destination if neither the
+          total size was reached nor a null byte was found in the source.  */
+       .align  4
+L(unaligned_lt_16):
+       cmpldi  cr7,r5,7
+       ble     cr7,L(short_path)       /* Fewer than 8 bytes requested.  */
+       ld      r7,0(r4)
+       li      r8,0
+       cmpb    r8,r7,r8        /* Non-zero if the doubleword has a null.  */
+       cmpdi   cr7,r8,0
+       bne     cr7,L(short_path_prepare_2)
+       addi    r6,r5,-8        /* r6 = size left after the first 8 bytes.  */
+       std     r7,0(r3)
+       addi    r9,r3,8         /* r9 = dest + 8.  */
+       cmpldi  cr7,r6,7
+       addi    r7,r4,8         /* r7 = src + 8.  */
+       ble     cr7,L(short_path_prepare_1_1)
+       ld      r4,8(r4)
+       cmpb    r8,r4,r8        /* r8 is still zero here.  */
+       cmpdi   cr7,r8,0
+       bne     cr7,L(short_path_prepare_2_1)
+       std     r4,8(r3)
+       addi    r29,r3,16       /* r29 = dest + 16.  */
+       addi    r5,r5,-16       /* r5 = size left after 16 bytes.  */
+       /* Neither the null byte was found nor the total length was reached,
+          align to 16 bytes and issue a bulk copy/compare.  */
+       b       L(align_to_16b)
+
+       /* In the case of 4k page boundary cross, the algorithm first aligns
+          the address down, calculates a mask based on the alignment to
+          ignore the bytes before the string and continues using doublewords.
+          On entry r9 = src & 7 and r8 = 8 - (src & 7), as computed at
+          function entry.  */
+       .align  4
+L(pagecross):
+       rldicr  r11,r4,0,59     /* Clear the low 4 bits of the address.
+                                  NOTE(review): this aligns to 16 bytes,
+                                  while the mask below is derived from
+                                  src & 7 -- confirm this is intended.  */
+       li      r6,-1           /* MASK = 0xffffffffffffffffUL.  */
+       sldi    r9,r9,3         /* Calculate padding.  */
+       ld      r7,0(r11)       /* Load doubleword from memory.  */
+#ifdef __LITTLE_ENDIAN__
+       sld     r9,r6,r9        /* MASK = MASK << padding.  */
+#else
+       srd     r9,r6,r9        /* MASK = MASK >> padding.  */
+#endif
+       orc     r9,r7,r9        /* Mask bits that are not part of the
+                                  string.  */
+       li      r7,0
+       cmpb    r9,r9,r7        /* Check for null bytes in DWORD1.  */
+       cmpdi   cr7,r9,0
+       bne     cr7,L(short_path_prepare_2)
+       subf    r8,r8,r5        /* Adjust total length.  */
+       cmpldi  cr7,r8,8        /* Check if length was reached.  */
+       ble     cr7,L(short_path_prepare_2)
+
+       /* For the next checks we have an aligned address, so we check three
+          more doublewords to make sure we can read 16 unaligned bytes
+          to start the bulk copy with 16 aligned addresses.  */
+       ld      r7,8(r11)
+       cmpb    r9,r7,r9        /* r9 is zero here (no null found so far).  */
+       cmpdi   cr7,r9,0
+       bne     cr7,L(short_path_prepare_2)
+       addi    r7,r8,-8
+       cmpldi  cr7,r7,8
+       ble     cr7,L(short_path_prepare_2)
+       ld      r7,16(r11)
+       cmpb    r9,r7,r9
+       cmpdi   cr7,r9,0
+       bne     cr7,L(short_path_prepare_2)
+       addi    r8,r8,-16
+       cmpldi  cr7,r8,8
+       ble     cr7,L(short_path_prepare_2)
+       ld      r8,24(r11)
+       cmpb    r9,r8,r9
+       cmpdi   cr7,r9,0
+       bne     cr7,L(short_path_prepare_2)
+
+       /* No null byte found in the 32 bytes read and length not reached,
+          read source again using unaligned loads and store them.  */
+       ld      r9,0(r4)
+       addi    r29,r3,16       /* r29 = dest + 16.  */
+       addi    r5,r5,-16       /* r5 = size left after 16 bytes.  */
+       std     r9,0(r3)
+       ld      r9,8(r4)
+       std     r9,8(r3)
+
+       /* Align source to 16 bytes and adjust destination and size.  On entry
+          r10 still holds the original src + 16 set at function entry,
+          r29 = dest + 16 and r5 = size - 16.  */
+L(align_to_16b):
+       rldicl  r9,r10,0,60     /* r9 = (src + 16) & 15.  */
+       rldicr  r28,r10,0,59    /* r28 = src + 16 rounded down to 16.  */
+       add     r12,r5,r9       /* r12 = size left from the aligned source.  */
+       subf    r29,r9,r29      /* Move dest back by the same amount.  */
+
+       /* The bulk read/compare/copy loads two doublewords, compare and merge
+          in a single register for speed.  This is an attempt to speed up the
+          null-checking process for bigger strings.  */
+
+       cmpldi  cr7,r12,15
+       ble     cr7,L(short_path_prepare_1_2)
+
+       /* Main loop for large sizes, unrolled 2 times to get better use of
+          pipeline.  */
+       ld      r8,0(r28)
+       ld      r10,8(r28)
+       li      r9,0
+       cmpb    r7,r8,r9
+       cmpb    r9,r10,r9
+       or.     r6,r9,r7        /* Non-zero if either doubleword has a null.  */
+       bne     cr0,L(short_path_prepare_2_3)
+       addi    r5,r12,-16
+       addi    r4,r28,16
+       std     r8,0(r29)
+       std     r10,8(r29)
+       cmpldi  cr7,r5,15
+       addi    r9,r29,16
+       ble     cr7,L(short_path_1)
+       mr      r11,r28
+       mr      r6,r29
+       li      r30,0           /* r30 = zero for the cmpb null checks.  */
+       subfic  r26,r4,48
+       subfic  r27,r9,48
+
+       b       L(loop_16b)
+
+       .align  4
+L(loop_start):
+       ld      r31,0(r11)
+       ld      r10,8(r11)
+       cmpb    r0,r31,r7       /* r7 is zero here (no null found so far).  */
+       cmpb    r8,r10,r7
+       or.     r7,r0,r8
+       addi    r5,r5,-32
+       cmpldi  cr7,r5,15
+       add     r4,r4,r26
+       add     r9,r9,r27
+       bne     cr0,L(short_path_prepare_2_2)
+       add     r4,r28,r4
+       std     r31,0(r6)
+       add     r9,r29,r9
+       std     r10,8(r6)
+       ble     cr7,L(short_path_1)
+
+L(loop_16b):
+       ld      r10,16(r11)
+       ld      r0,24(r11)
+       cmpb    r8,r10,r30
+       cmpb    r7,r0,r30
+       or.     r7,r8,r7
+       addi    r12,r12,-32
+       cmpldi  cr7,r12,15
+       addi    r11,r11,32
+       bne     cr0,L(short_path_2)
+       std     r10,16(r6)
+       addi    r6,r6,32
+       std     r0,-8(r6)
+       bgt     cr7,L(loop_start)
+
+       mr      r5,r12
+       mr      r4,r11
+       mr      r9,r6
+       b       L(short_path_1)
+
+       .align  4       /* Glue stubs.  Each one moves values into the
+                          registers read by its target label and branches
+                          there: the short path reads the source through r4,
+                          writes through r9 and counts down r5; the zero-pad
+                          code clears r5 bytes starting at r9.  */
+L(short_path_prepare_1_1):
+       mr      r5,r6
+       mr      r4,r7
+       b       L(short_path_1)
+L(short_path_prepare_1_2):
+       mr      r5,r12
+       mr      r4,r28
+       mr      r9,r29
+       b       L(short_path_1)
+L(short_path_prepare_2):
+       mr      r9,r3           /* Start writing at dest itself.  */
+       b       L(short_path_2)
+L(short_path_prepare_2_1):
+       mr      r5,r6
+       mr      r4,r7
+       b       L(short_path_2)
+L(short_path_prepare_2_2):
+       mr      r5,r12
+       mr      r4,r11
+       mr      r9,r6
+       b       L(short_path_2)
+L(short_path_prepare_2_3):
+       mr      r5,r12
+       mr      r4,r28
+       mr      r9,r29
+       b       L(short_path_2)
+L(zero_pad_loop_b_prepare):
+       addi    r10,r9,8        /* Skip the doubleword already cleared.  */
+       rldicl  r5,r5,0,61      /* r5 = size & 7.  */
+       b       L(zero_pad_loop_b_start)
+L(zero_pad_start_prepare_1):
+       mr      r5,r6
+       mr      r9,r8
+       b       L(zero_pad_start_1)
+END (FUNC_NAME)
+
+#ifdef USE_AS_STPNCPY
+libc_hidden_def (__stpncpy)
+#else  /* !USE_AS_STPNCPY */
+libc_hidden_builtin_def (strncpy)
+#endif /* USE_AS_STPNCPY */