PowerPC: optimized memmove for POWER7/PPC64
author     Adhemerval Zanella <azanella@linux.vnet.ibm.com>
           Fri, 20 Jun 2014 17:55:16 +0000 (12:55 -0500)
committer  Adhemerval Zanella <azanella@linux.vnet.ibm.com>
           Mon, 7 Jul 2014 20:41:21 +0000 (15:41 -0500)
This patch adds an optimized memmove implementation for POWER7/powerpc64.
The idea is to use the POWER7 memcpy on non-overlapping memory regions
and an optimized backward copy on memory regions that overlap (similar
to the approach of string/memmove.c).

The backward copy algorithm is similar to the one used by the POWER7
memcpy, with adjustments for alignment.  The difference is that memory
is always aligned to 16 bytes before VSX/Altivec instructions are used.
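
As a rough C sketch of that dispatch (illustrative only: memmove_sketch,
forward_copy and backward_copy are placeholder names, and the byte-wise
loops stand in for the optimized POWER7 code paths), the overlap test
reduces to a single unsigned comparison, mirroring the subf/cmpld/blt
sequence at the top of the assembly:

#include <stddef.h>
#include <stdint.h>

/* Byte-wise stand-ins for the optimized copy loops.  */
static void
forward_copy (unsigned char *d, const unsigned char *s, size_t n)
{
  for (size_t i = 0; i < n; i++)
    d[i] = s[i];
}

static void
backward_copy (unsigned char *d, const unsigned char *s, size_t n)
{
  while (n-- > 0)
    d[n] = s[n];
}

void *
memmove_sketch (void *dest, const void *src, size_t len)
{
  unsigned char *d = dest;
  const unsigned char *s = src;

  /* Unsigned wrap-around turns the overlap test into one comparison:
     when (dest - src) >= len a forward copy cannot clobber unread
     source bytes; otherwise the destination starts inside the source
     region and the copy must run backwards.  */
  if ((uintptr_t) d - (uintptr_t) s >= len)
    forward_copy (d, s, len);
  else
    backward_copy (d, s, len);

  return dest;
}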

ChangeLog
string/bcopy.c
sysdeps/powerpc/powerpc64/multiarch/Makefile
sysdeps/powerpc/powerpc64/multiarch/bcopy-ppc64.c [new file with mode: 0644]
sysdeps/powerpc/powerpc64/multiarch/bcopy.c [new file with mode: 0644]
sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S [new file with mode: 0644]
sysdeps/powerpc/powerpc64/multiarch/memmove-ppc64.c [new file with mode: 0644]
sysdeps/powerpc/powerpc64/multiarch/memmove.c [new file with mode: 0644]
sysdeps/powerpc/powerpc64/power7/bcopy.c [new file with mode: 0644]
sysdeps/powerpc/powerpc64/power7/memmove.S [new file with mode: 0644]

index a6032da..abead37 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,25 @@
 2014-07-07  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
+       * string/bcopy.c: Use full path to include memmove.c.
+       * sysdeps/powerpc/powerpc64/multiarch/Makefile: Add memmove and bcopy
+       multiarch objects.
+       * sysdeps/powerpc/powerpc64/multiarch/bcopy-ppc64.c: New file: default
+       bcopy for powerpc64.
+       * sysdeps/powerpc/powerpc64/multiarch/bcopy.c: New file: multiarch
+       bcopy for powerpc64.
+       * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c: Add bcopy
+       and memmove implementations.
+       * sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S: New file:
+       optimized multiarch memmove for POWER7/powerpc64.
+       * sysdeps/powerpc/powerpc64/multiarch/memmove-ppc64.c: New file:
+       default multiarch memmove for powerpc64.
+       * sysdeps/powerpc/powerpc64/multiarch/memmove.c: New file: memmove
+       multiarch for powerpc64.
+       * sysdeps/powerpc/powerpc64/power7/bcopy.c: New file: optimized bcopy
+       for POWER7/powerpc64.
+       * sysdeps/powerpc/powerpc64/power7/memmove.S: New file: optimized
+       memmove for POWER7/powerpc64.
+
        * sysdeps/powerpc/memmove.c (memmove): Clean up implementation to use
        the glibc default one.
 
index 7c1225c..f497b5d 100644
--- a/string/bcopy.c
+++ b/string/bcopy.c
@@ -25,4 +25,4 @@
 #define        a2              dest
 #define        a2const
 
-#include <memmove.c>
+#include <string/memmove.c>
index 524055f..82722fb 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -18,7 +18,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
                   strspn-power7 strspn-ppc64 strcspn-power7 strcspn-ppc64 \
                   strpbrk-power7 strpbrk-ppc64 strncpy-power7 strncpy-ppc64 \
                   stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 \
-                  strcat-power7 strcat-ppc64
+                  strcat-power7 strcat-ppc64 memmove-power7 memmove-ppc64 \
+                  bcopy-ppc64
 
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bcopy-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/bcopy-ppc64.c
new file mode 100644
index 0000000..f8d5aa5
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/bcopy-ppc64.c
@@ -0,0 +1,25 @@
+/* PowerPC64 default bcopy.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+
+extern __typeof (bcopy) __bcopy_ppc attribute_hidden;
+
+#define bcopy __bcopy_ppc
+
+#include <string/bcopy.c>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
new file mode 100644
index 0000000..0688907
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
@@ -0,0 +1,29 @@
+/* PowerPC64 multiarch bcopy.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+#include "init-arch.h"
+
+extern __typeof (bcopy) __bcopy_ppc attribute_hidden;
+/* The __bcopy_power7 symbol is implemented in memmove-power7.S.  */
+extern __typeof (bcopy) __bcopy_power7 attribute_hidden;
+
+libc_ifunc (bcopy,
+            (hwcap & PPC_FEATURE_HAS_VSX)
+            ? __bcopy_power7
+            : __bcopy_ppc);
index ec2e027..a574487 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -61,6 +61,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
                              __memcpy_power4)
              IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_ppc))
 
+  /* Support sysdeps/powerpc/powerpc64/multiarch/memmove.c.  */
+  IFUNC_IMPL (i, name, memmove,
+             IFUNC_IMPL_ADD (array, i, memmove, hwcap & PPC_FEATURE_HAS_VSX,
+                             __memmove_power7)
+             IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ppc))
+
   /* Support sysdeps/powerpc/powerpc64/multiarch/memset.c.  */
   IFUNC_IMPL (i, name, memset,
              IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_HAS_VSX,
@@ -136,6 +142,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
                              __bzero_power4)
              IFUNC_IMPL_ADD (array, i, bzero, 1, __bzero_ppc))
 
+  /* Support sysdeps/powerpc/powerpc64/multiarch/bcopy.c.  */
+  IFUNC_IMPL (i, name, bcopy,
+             IFUNC_IMPL_ADD (array, i, bcopy, hwcap & PPC_FEATURE_HAS_VSX,
+                             __bcopy_power7)
+             IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ppc))
+
   /* Support sysdeps/powerpc/powerpc64/multiarch/mempcpy.c.  */
   IFUNC_IMPL (i, name, mempcpy,
              IFUNC_IMPL_ADD (array, i, mempcpy,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S b/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
new file mode 100644
index 0000000..667e7bc
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
@@ -0,0 +1,43 @@
+/* Optimized memmove implementation for PowerPC64/POWER7.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#undef EALIGN
+#define EALIGN(name, alignt, words)                            \
+  .section ".text";                                            \
+  ENTRY_2(__memmove_power7)                                    \
+  .align ALIGNARG(alignt);                                     \
+  EALIGN_W_##words;                                            \
+  BODY_LABEL(__memmove_power7):                                        \
+  cfi_startproc;                                               \
+  LOCALENTRY(__memmove_power7)
+
+#undef END_GEN_TB
+#define END_GEN_TB(name, mask)                                 \
+  cfi_endproc;                                                 \
+  TRACEBACK_MASK(__memmove_power7,mask)                                \
+  END_2(__memmove_power7)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#undef bcopy
+#define bcopy __bcopy_power7
+
+#include <sysdeps/powerpc/powerpc64/power7/memmove.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/memmove-ppc64.c
new file mode 100644
index 0000000..b39348f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/memmove-ppc64.c
@@ -0,0 +1,28 @@
+/* Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+
+#define MEMMOVE __memmove_ppc
+#if !defined(NOT_IN_libc) && defined(SHARED)
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name)
+#endif
+
+extern __typeof (memmove) __memmove_ppc attribute_hidden;
+
+#include <sysdeps/powerpc/memmove.c>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove.c b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
new file mode 100644
index 0000000..9a1ce8f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
@@ -0,0 +1,45 @@
+/* Multiple versions of memmove. PowerPC64 version.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc and for
+   the DSO.  In static binaries we need memmove before initialization
+   has happened.  */
+#if defined SHARED && !defined NOT_IN_libc
+/* Redefine memmove so that the compiler won't complain about the type
+   mismatch with the IFUNC selector in strong_alias, below.  */
+# undef memmove
+# define memmove __redirect_memmove
+# include <string.h>
+# include "init-arch.h"
+
+extern __typeof (__redirect_memmove) __libc_memmove;
+
+extern __typeof (__redirect_memmove) __memmove_ppc attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_power7 attribute_hidden;
+
+libc_ifunc (__libc_memmove,
+            (hwcap & PPC_FEATURE_HAS_VSX)
+            ? __memmove_power7
+            : __memmove_ppc);
+
+#undef memmove
+strong_alias (__libc_memmove, memmove);
+libc_hidden_ver (__libc_memmove, memmove);
+#else
+# include <string/memmove.c>
+#endif
diff --git a/sysdeps/powerpc/powerpc64/power7/bcopy.c b/sysdeps/powerpc/powerpc64/power7/bcopy.c
new file mode 100644
index 0000000..4a6a400
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/bcopy.c
@@ -0,0 +1 @@
+/* Implemented in memmove.S.  */
diff --git a/sysdeps/powerpc/powerpc64/power7/memmove.S b/sysdeps/powerpc/powerpc64/power7/memmove.S
new file mode 100644
index 0000000..8663dfb
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/memmove.S
@@ -0,0 +1,831 @@
+/* Optimized memmove implementation for PowerPC64/POWER7.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+
+/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5])
+
+   This implementation checks whether 'dest' overlaps with 'src'.  If it
+   does not, an optimized memcpy path is used (similar to the POWER7
+   memcpy, embedded here to gain some cycles).
+   If source and destination overlap, an optimized backward copy is used
+   instead.  */
+
+       .machine power7
+EALIGN (memmove, 5, 0)
+       CALL_MCOUNT 3
+
+L(_memmove):
+       subf    r9,r4,r3
+       cmpld   cr7,r9,r5
+       blt     cr7,L(memmove_bwd)
+
+       cmpldi  cr1,r5,31
+       neg     0,3
+       ble     cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
+                                      code.  */
+
+       andi.   10,3,15
+       clrldi  11,4,60
+       cmpld   cr6,10,11       /* SRC and DST alignments match?  */
+
+       mr      r11,3
+       bne     cr6,L(copy_GE_32_unaligned)
+       beq     L(aligned_copy)
+
+       mtocrf  0x01,0
+       clrldi  0,0,60
+
+/* Get the DST and SRC aligned to 8 bytes (16 for little-endian).  */
+1:
+       bf      31,2f
+       lbz     6,0(r4)
+       addi    r4,r4,1
+       stb     6,0(r11)
+       addi    r11,r11,1
+2:
+       bf      30,4f
+       lhz     6,0(r4)
+       addi    r4,r4,2
+       sth     6,0(r11)
+       addi    r11,r11,2
+4:
+       bf      29,8f
+       lwz     6,0(r4)
+       addi    r4,r4,4
+       stw     6,0(r11)
+       addi    r11,r11,4
+8:
+       bf      28,16f
+       ld      6,0(r4)
+       addi    r4,r4,8
+       std     6,0(r11)
+       addi    r11,r11,8
+16:
+       subf    r5,0,r5
+
+/* Main aligned copy loop. Copies 128 bytes at a time. */
+L(aligned_copy):
+       li      6,16
+       li      7,32
+       li      8,48
+       mtocrf  0x02,r5
+       srdi    12,r5,7
+       cmpdi   12,0
+       beq     L(aligned_tail)
+       lxvd2x  6,0,r4
+       lxvd2x  7,r4,6
+       mtctr   12
+       b       L(aligned_128loop)
+
+       .align  4
+L(aligned_128head):
+       /* for the 2nd + iteration of this loop. */
+       lxvd2x  6,0,r4
+       lxvd2x  7,r4,6
+L(aligned_128loop):
+       lxvd2x  8,r4,7
+       lxvd2x  9,r4,8
+       stxvd2x 6,0,r11
+       addi    r4,r4,64
+       stxvd2x 7,r11,6
+       stxvd2x 8,r11,7
+       stxvd2x 9,r11,8
+       lxvd2x  6,0,r4
+       lxvd2x  7,r4,6
+       addi    r11,r11,64
+       lxvd2x  8,r4,7
+       lxvd2x  9,r4,8
+       addi    r4,r4,64
+       stxvd2x 6,0,r11
+       stxvd2x 7,r11,6
+       stxvd2x 8,r11,7
+       stxvd2x 9,r11,8
+       addi    r11,r11,64
+       bdnz    L(aligned_128head)
+
+L(aligned_tail):
+       mtocrf  0x01,r5
+       bf      25,32f
+       lxvd2x  6,0,r4
+       lxvd2x  7,r4,6
+       lxvd2x  8,r4,7
+       lxvd2x  9,r4,8
+       addi    r4,r4,64
+       stxvd2x 6,0,r11
+       stxvd2x 7,r11,6
+       stxvd2x 8,r11,7
+       stxvd2x 9,r11,8
+       addi    r11,r11,64
+32:
+       bf      26,16f
+       lxvd2x  6,0,r4
+       lxvd2x  7,r4,6
+       addi    r4,r4,32
+       stxvd2x 6,0,r11
+       stxvd2x 7,r11,6
+       addi    r11,r11,32
+16:
+       bf      27,8f
+       lxvd2x  6,0,r4
+       addi    r4,r4,16
+       stxvd2x 6,0,r11
+       addi    r11,r11,16
+8:
+       bf      28,4f
+       ld      6,0(r4)
+       addi    r4,r4,8
+       std     6,0(r11)
+       addi    r11,r11,8
+4:     /* Copies 4~7 bytes.  */
+       bf      29,L(tail2)
+       lwz     6,0(r4)
+       stw     6,0(r11)
+       bf      30,L(tail5)
+       lhz     7,4(r4)
+       sth     7,4(r11)
+       bflr    31
+       lbz     8,6(r4)
+       stb     8,6(r11)
+       /* Return original DST pointer.  */
+       blr
+
+/* Handle copies of 0~31 bytes.  */
+       .align  4
+L(copy_LT_32):
+       mr      r11,3
+       cmpldi  cr6,r5,8
+       mtocrf  0x01,r5
+       ble     cr6,L(copy_LE_8)
+
+       /* At least 9 bytes to go.  */
+       neg     8,4
+       andi.   0,8,3
+       cmpldi  cr1,r5,16
+       beq     L(copy_LT_32_aligned)
+
+       /* Force 4-byte alignment for SRC.  */
+       mtocrf  0x01,0
+       subf    r5,0,r5
+2:
+       bf      30,1f
+       lhz     6,0(r4)
+       addi    r4,r4,2
+       sth     6,0(r11)
+       addi    r11,r11,2
+1:
+       bf      31,L(end_4bytes_alignment)
+       lbz     6,0(r4)
+       addi    r4,r4,1
+       stb     6,0(r11)
+       addi    r11,r11,1
+
+       .align  4
+L(end_4bytes_alignment):
+       cmpldi  cr1,r5,16
+       mtocrf  0x01,r5
+
+L(copy_LT_32_aligned):
+       /* At least 6 bytes to go, and SRC is word-aligned.  */
+       blt     cr1,8f
+
+       /* Copy 16 bytes.  */
+       lwz     6,0(r4)
+       lwz     7,4(r4)
+       stw     6,0(r11)
+       lwz     8,8(r4)
+       stw     7,4(r11)
+       lwz     6,12(r4)
+       addi    r4,r4,16
+       stw     8,8(r11)
+       stw     6,12(r11)
+       addi    r11,r11,16
+8:     /* Copy 8 bytes.  */
+       bf      28,L(tail4)
+       lwz     6,0(r4)
+       lwz     7,4(r4)
+       addi    r4,r4,8
+       stw     6,0(r11)
+       stw     7,4(r11)
+       addi    r11,r11,8
+
+       .align  4
+/* Copies 4~7 bytes.  */
+L(tail4):
+       bf      29,L(tail2)
+       lwz     6,0(r4)
+       stw     6,0(r11)
+       bf      30,L(tail5)
+       lhz     7,4(r4)
+       sth     7,4(r11)
+       bflr    31
+       lbz     8,6(r4)
+       stb     8,6(r11)
+       /* Return original DST pointer.  */
+       blr
+
+       .align  4
+/* Copies 2~3 bytes.  */
+L(tail2):
+       bf      30,1f
+       lhz     6,0(r4)
+       sth     6,0(r11)
+       bflr    31
+       lbz     7,2(r4)
+       stb     7,2(r11)
+       blr
+
+       .align  4
+L(tail5):
+       bflr    31
+       lbz     6,4(r4)
+       stb     6,4(r11)
+       blr
+
+       .align  4
+1:
+       bflr    31
+       lbz     6,0(r4)
+       stb     6,0(r11)
+       /* Return original DST pointer.  */
+       blr
+
+/* Handles copies of 0~8 bytes.  */
+       .align  4
+L(copy_LE_8):
+       bne     cr6,L(tail4)
+
+       /* Though we could've used ld/std here, they are still
+       slow for unaligned cases.  */
+
+       lwz     6,0(r4)
+       lwz     7,4(r4)
+       stw     6,0(r11)
+       stw     7,4(r11)
+       blr
+
+
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+   SRC is not. Use aligned quadword loads from SRC, shifted to realign
+   the data, allowing for aligned DST stores.  */
+       .align  4
+L(copy_GE_32_unaligned):
+       clrldi  0,0,60        /* Number of bytes until the 1st r11 quadword.  */
+       srdi    9,r5,4        /* Number of full quadwords remaining.  */
+
+       beq     L(copy_GE_32_unaligned_cont)
+
+       /* DST is not quadword aligned, get it aligned.  */
+
+       mtocrf  0x01,0
+       subf    r5,0,r5
+
+       /* Vector instructions work best when proper alignment (16-bytes)
+       is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
+1:
+       bf      31,2f
+       lbz     6,0(r4)
+       addi    r4,r4,1
+       stb     6,0(r11)
+       addi    r11,r11,1
+2:
+       bf      30,4f
+       lhz     6,0(r4)
+       addi    r4,r4,2
+       sth     6,0(r11)
+       addi    r11,r11,2
+4:
+       bf      29,8f
+       lwz     6,0(r4)
+       addi    r4,r4,4
+       stw     6,0(r11)
+       addi    r11,r11,4
+8:
+       bf      28,0f
+       ld      6,0(r4)
+       addi    r4,r4,8
+       std     6,0(r11)
+       addi    r11,r11,8
+0:
+       srdi    9,r5,4        /* Number of full quadwords remaining.  */
+
+       /* The proper alignment is present, it is OK to copy the bytes now.  */
+L(copy_GE_32_unaligned_cont):
+
+       /* Setup two indexes to speed up the indexed vector operations.  */
+       clrldi  10,r5,60
+       li      6,16          /* Index for 16-bytes offsets.  */
+       li      7,32          /* Index for 32-bytes offsets.  */
+       cmpldi  cr1,10,0
+       srdi    8,r5,5        /* Setup the loop counter.  */
+       mtocrf  0x01,9
+       cmpldi  cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+       lvsr    5,0,r4
+#else
+       lvsl    5,0,r4
+#endif
+       lvx     3,0,r4
+       li      0,0
+       bf      31,L(setup_unaligned_loop)
+
+       /* Copy another 16 bytes to align to 32-bytes due to the loop.  */
+       lvx     4,r4,6
+#ifdef __LITTLE_ENDIAN__
+       vperm   6,4,3,5
+#else
+       vperm   6,3,4,5
+#endif
+       addi    r4,r4,16
+       stvx    6,0,r11
+       addi    r11,r11,16
+       vor     3,4,4
+       clrrdi  0,r4,60
+
+L(setup_unaligned_loop):
+       mtctr   8
+       ble     cr6,L(end_unaligned_loop)
+
+       /* Copy 32 bytes at a time using vector instructions.  */
+       .align  4
+L(unaligned_loop):
+
+       /* Note: vr6/vr10 may contain data that was already copied,
+       but in order to get proper alignment, we may have to copy
+       some portions again. This is faster than having unaligned
+       vector instructions though.  */
+
+       lvx     4,r4,6
+#ifdef __LITTLE_ENDIAN__
+       vperm   6,4,3,5
+#else
+       vperm   6,3,4,5
+#endif
+       lvx     3,r4,7
+#ifdef __LITTLE_ENDIAN__
+       vperm   10,3,4,5
+#else
+       vperm   10,4,3,5
+#endif
+       addi    r4,r4,32
+       stvx    6,0,r11
+       stvx    10,r11,6
+       addi    r11,r11,32
+       bdnz    L(unaligned_loop)
+
+       clrrdi  0,r4,60
+
+       .align  4
+L(end_unaligned_loop):
+
+       /* Check for tail bytes.  */
+       mtocrf  0x01,r5
+       beqlr   cr1
+
+       add     r4,r4,0
+
+       /*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
+       /* Copy 8 bytes.  */
+       bf      28,4f
+       lwz     6,0(r4)
+       lwz     7,4(r4)
+       addi    r4,r4,8
+       stw     6,0(r11)
+       stw     7,4(r11)
+       addi    r11,r11,8
+4:     /* Copy 4~7 bytes.  */
+       bf      29,L(tail2)
+       lwz     6,0(r4)
+       stw     6,0(r11)
+       bf      30,L(tail5)
+       lhz     7,4(r4)
+       sth     7,4(r11)
+       bflr    31
+       lbz     8,6(r4)
+       stb     8,6(r11)
+       /* Return original DST pointer.  */
+       blr
+
+       /* Start of the backward copy implementation: the algorithm first
+          checks whether src and dest have the same alignment; if they do,
+          it aligns both to 16 bytes and copies using VSX instructions.
+          If they do not, it aligns dest to 16 bytes and uses VMX (Altivec)
+          instructions to read two 16-byte blocks at a time, shifting and
+          permuting the bytes read before storing them aligned to dest.  */
+L(memmove_bwd):
+       cmpldi  cr1,r5,31
+       /* Copy is done backwards: update the pointers and check alignment.  */
+       add     r11,r3,r5
+       add     r4,r4,r5
+       mr      r0,r11
+       ble     cr1, L(copy_LT_32_bwd)  /* If move < 32 bytes use short move
+                                          code.  */
+
+       andi.   r10,r11,15          /* Check if r11 is aligned to 16 bytes  */
+       clrldi  r9,r4,60            /* Check if r4 is aligned to 16 bytes  */
+       cmpld   cr6,r10,r9          /* SRC and DST alignments match?  */
+
+       bne     cr6,L(copy_GE_32_unaligned_bwd)
+       beq     L(aligned_copy_bwd)
+
+       mtocrf  0x01,r0
+       clrldi  r0,r0,60
+
+/* Get the DST and SRC aligned to 16 bytes.  */
+1:
+       bf      31,2f
+       lbz     r6,-1(r4)
+       subi    r4,r4,1
+       stb     r6,-1(r11)
+       subi    r11,r11,1
+2:
+       bf      30,4f
+       lhz     r6,-2(r4)
+       subi    r4,r4,2
+       sth     r6,-2(r11)
+       subi    r11,r11,2
+4:
+       bf      29,8f
+       lwz     r6,-4(r4)
+       subi    r4,r4,4
+       stw     r6,-4(r11)
+       subi    r11,r11,4
+8:
+       bf      28,16f
+       ld      r6,-8(r4)
+       subi    r4,r4,8
+       std     r6,-8(r11)
+       subi    r11,r11,8
+16:
+       subf    r5,0,r5
+
+/* Main aligned copy loop. Copies 128 bytes at a time. */
+L(aligned_copy_bwd):
+       li      r6,-16
+       li      r7,-32
+       li      r8,-48
+       li      r9,-64
+       mtocrf  0x02,r5
+       srdi    r12,r5,7
+       cmpdi   r12,0
+       beq     L(aligned_tail_bwd)
+       lxvd2x  v6,r4,r6
+       lxvd2x  v7,r4,r7
+       mtctr   12
+       b       L(aligned_128loop_bwd)
+
+       .align  4
+L(aligned_128head_bwd):
+       /* for the 2nd + iteration of this loop. */
+       lxvd2x  v6,r4,r6
+       lxvd2x  v7,r4,r7
+L(aligned_128loop_bwd):
+       lxvd2x  v8,r4,r8
+       lxvd2x  v9,r4,r9
+       stxvd2x v6,r11,r6
+       subi    r4,r4,64
+       stxvd2x v7,r11,r7
+       stxvd2x v8,r11,r8
+       stxvd2x v9,r11,r9
+       lxvd2x  v6,r4,r6
+       lxvd2x  v7,r4,7
+       subi    r11,r11,64
+       lxvd2x  v8,r4,r8
+       lxvd2x  v9,r4,r9
+       subi    r4,r4,64
+       stxvd2x v6,r11,r6
+       stxvd2x v7,r11,r7
+       stxvd2x v8,r11,r8
+       stxvd2x v9,r11,r9
+       subi    r11,r11,64
+       bdnz    L(aligned_128head_bwd)
+
+L(aligned_tail_bwd):
+       mtocrf  0x01,r5
+       bf      25,32f
+       lxvd2x  v6,r4,r6
+       lxvd2x  v7,r4,r7
+       lxvd2x  v8,r4,r8
+       lxvd2x  v9,r4,r9
+       subi    r4,r4,64
+       stxvd2x v6,r11,r6
+       stxvd2x v7,r11,r7
+       stxvd2x v8,r11,r8
+       stxvd2x v9,r11,r9
+       subi    r11,r11,64
+32:
+       bf      26,16f
+       lxvd2x  v6,r4,r6
+       lxvd2x  v7,r4,r7
+       subi    r4,r4,32
+       stxvd2x v6,r11,r6
+       stxvd2x v7,r11,r7
+       subi    r11,r11,32
+16:
+       bf      27,8f
+       lxvd2x  v6,r4,r6
+       subi    r4,r4,16
+       stxvd2x v6,r11,r6
+       subi    r11,r11,16
+8:
+       bf      28,4f
+       ld      r6,-8(r4)
+       subi    r4,r4,8
+       std     r6,-8(r11)
+       subi    r11,r11,8
+4:     /* Copies 4~7 bytes.  */
+       bf      29,L(tail2_bwd)
+       lwz     r6,-4(r4)
+       stw     r6,-4(r11)
+       bf      30,L(tail5_bwd)
+       lhz     r7,-6(r4)
+       sth     r7,-6(r11)
+       bflr    31
+       lbz     r8,-7(r4)
+       stb     r8,-7(r11)
+       /* Return original DST pointer.  */
+       blr
+
+/* Handle copies of 0~31 bytes.  */
+       .align  4
+L(copy_LT_32_bwd):
+       cmpldi  cr6,r5,8
+       mtocrf  0x01,r5
+       ble     cr6,L(copy_LE_8_bwd)
+
+       /* At least 9 bytes to go.  */
+       neg     r8,r4
+       andi.   r0,r8,3
+       cmpldi  cr1,r5,16
+       beq     L(copy_LT_32_aligned_bwd)
+
+       /* Force 4-byte alignment for SRC.  */
+       mtocrf  0x01,0
+       subf    r5,0,r5
+2:
+       bf      30,1f
+       lhz     r6,-2(r4)
+       subi    r4,r4,2
+       sth     r6,-2(r11)
+       subi    r11,r11,2
+1:
+       bf      31,L(end_4bytes_alignment_bwd)
+       lbz     6,-1(r4)
+       subi    r4,r4,1
+       stb     6,-1(r11)
+       subi    r11,r11,1
+
+       .align  4
+L(end_4bytes_alignment_bwd):
+       cmpldi  cr1,r5,16
+       mtocrf  0x01,r5
+
+L(copy_LT_32_aligned_bwd):
+       /* At least 6 bytes to go, and SRC is word-aligned.  */
+       blt     cr1,8f
+
+       /* Copy 16 bytes.  */
+       lwz     r6,-4(r4)
+       lwz     r7,-8(r4)
+       stw     r6,-4(r11)
+       lwz     r8,-12(r4)
+       stw     r7,-8(r11)
+       lwz     r6,-16(r4)
+       subi    r4,r4,16
+       stw     r8,-12(r11)
+       stw     r6,-16(r11)
+       subi    r11,r11,16
+8:     /* Copy 8 bytes.  */
+       bf      28,L(tail4_bwd)
+       lwz     r6,-4(r4)
+       lwz     r7,-8(r4)
+       subi    r4,r4,8
+       stw     r6,-4(r11)
+       stw     r7,-8(r11)
+       subi    r11,r11,8
+
+       .align  4
+/* Copies 4~7 bytes.  */
+L(tail4_bwd):
+       bf      29,L(tail2_bwd)
+       lwz     6,-4(r4)
+       stw     6,-4(r11)
+       bf      30,L(tail5_bwd)
+       lhz     7,-6(r4)
+       sth     7,-6(r11)
+       bflr    31
+       lbz     8,-7(r4)
+       stb     8,-7(r11)
+       /* Return original DST pointer.  */
+       blr
+
+       .align  4
+/* Copies 2~3 bytes.  */
+L(tail2_bwd):
+       bf      30,1f
+       lhz     6,-2(r4)
+       sth     6,-2(r11)
+       bflr    31
+       lbz     7,-3(r4)
+       stb     7,-3(r11)
+       blr
+
+       .align  4
+L(tail5_bwd):
+       bflr    31
+       lbz     6,-5(r4)
+       stb     6,-5(r11)
+       blr
+
+       .align  4
+1:
+       bflr    31
+       lbz     6,-1(r4)
+       stb     6,-1(r11)
+       /* Return original DST pointer.  */
+       blr
+
+
+/* Handles copies of 0~8 bytes.  */
+       .align  4
+L(copy_LE_8_bwd):
+       bne     cr6,L(tail4_bwd)
+
+       /* Though we could've used ld/std here, they are still
+          slow for unaligned cases.  */
+       lwz     6,-8(r4)
+       lwz     7,-4(r4)
+       stw     6,-8(r11)
+       stw     7,-4(r11)
+       blr
+
+
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+   SRC is not. Use aligned quadword loads from SRC, shifted to realign
+   the data, allowing for aligned DST stores.  */
+       .align  4
+L(copy_GE_32_unaligned_bwd):
+       andi.   r10,r11,15      /* Check alignment of DST against 16 bytes..  */
+       srdi    r9,r5,4         /* Number of full quadwords remaining.  */
+
+       beq     L(copy_GE_32_unaligned_cont_bwd)
+
+       /* DST is not quadword aligned and r10 holds the address masked to
+           compare alignments.  */
+       mtocrf  0x01,r10
+       subf    r5,r10,r5
+
+       /* Vector instructions work best when proper alignment (16-bytes)
+       is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
+1:
+       bf      31,2f
+       lbz     r6,-1(r4)
+       subi    r4,r4,1
+       stb     r6,-1(r11)
+       subi    r11,r11,1
+2:
+       bf      30,4f
+       lhz     r6,-2(r4)
+       subi    r4,r4,2
+       sth     r6,-2(r11)
+       subi    r11,r11,2
+4:
+       bf      29,8f
+       lwz     r6,-4(r4)
+       subi    r4,r4,4
+       stw     r6,-4(r11)
+       subi    r11,r11,4
+8:
+       bf      28,0f
+       ld      r6,-8(r4)
+       subi    r4,r4,8
+       std     r6,-8(r11)
+       subi    r11,r11,8
+0:
+       srdi    r9,r5,4       /* Number of full quadwords remaining.  */
+
+       /* The proper alignment is present, it is OK to copy the bytes now.  */
+L(copy_GE_32_unaligned_cont_bwd):
+
+       /* Setup two indexes to speed up the indexed vector operations.  */
+       clrldi  r10,r5,60
+       li      r6,-16        /* Index for 16-bytes offsets.  */
+       li      r7,-32        /* Index for 32-bytes offsets.  */
+       cmpldi  cr1,10,0
+       srdi    r8,r5,5       /* Setup the loop counter.  */
+       mtocrf  0x01,9
+       cmpldi  cr6,r9,1
+#ifdef __LITTLE_ENDIAN__
+       lvsr    v5,r0,r4
+#else
+       lvsl    v5,r0,r4
+#endif
+       lvx     v3,0,r4
+       li      r0,0
+       bf      31,L(setup_unaligned_loop_bwd)
+
+       /* Copy another 16 bytes to align to 32-bytes due to the loop.  */
+       lvx     v4,r4,r6
+#ifdef __LITTLE_ENDIAN__
+       vperm   v6,v3,v4,v5
+#else
+       vperm   v6,v4,v3,v5
+#endif
+       subi    r4,r4,16
+       stvx    v6,r11,r6
+       subi    r11,r11,16
+       vor     v3,v4,v4
+       clrrdi  r0,r4,60
+
+L(setup_unaligned_loop_bwd):
+       mtctr   r8
+       ble     cr6,L(end_unaligned_loop_bwd)
+
+       /* Copy 32 bytes at a time using vector instructions.  */
+       .align  4
+L(unaligned_loop_bwd):
+
+       /* Note: vr6/vr10 may contain data that was already copied,
+       but in order to get proper alignment, we may have to copy
+       some portions again. This is faster than having unaligned
+       vector instructions though.  */
+
+       lvx     v4,r4,r6
+#ifdef __LITTLE_ENDIAN__
+       vperm   v6,v3,v4,v5
+#else
+       vperm   v6,v4,v3,v5
+#endif
+       lvx     v3,r4,r7
+#ifdef __LITTLE_ENDIAN__
+       vperm   v10,v4,v3,v5
+#else
+       vperm   v10,v3,v4,v5
+#endif
+       subi    r4,r4,32
+       stvx    v6,r11,r6
+       stvx    v10,r11,r7
+       subi    r11,r11,32
+       bdnz    L(unaligned_loop_bwd)
+
+       clrrdi  r0,r4,60
+
+       .align  4
+L(end_unaligned_loop_bwd):
+
+       /* Check for tail bytes.  */
+       mtocrf  0x01,r5
+       beqlr   cr1
+
+       add     r4,r4,0
+
+       /*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
+       /* Copy 8 bytes.  */
+       bf      28,4f
+       lwz     r6,-4(r4)
+       lwz     r7,-8(r4)
+       subi    r4,r4,8
+       stw     r6,-4(r11)
+       stw     r7,-8(r11)
+       subi    r11,r11,8
+4:     /* Copy 4~7 bytes.  */
+       bf      29,L(tail2_bwd)
+       lwz     r6,-4(r4)
+       stw     r6,-4(r11)
+       bf      30,L(tail5_bwd)
+       lhz     r7,-6(r4)
+       sth     r7,-6(r11)
+       bflr    31
+       lbz     r8,-7(r4)
+       stb     r8,-7(r11)
+       /* Return original DST pointer.  */
+       blr
+END_GEN_TB (memmove, TB_TOCLESS)
+libc_hidden_builtin_def (memmove)
+
+
+/* void bcopy (const void *src [r3], void *dest [r4], size_t n [r5])
+   Implemented in this file to avoid the linker creating a stub for the
+   branch to '_memmove'.  */
+ENTRY (bcopy)
+       mr      r6,r3
+       mr      r3,r4
+       mr      r4,r6
+       b       L(_memmove)
+END (bcopy)