power7 memcpy VSX optimizations

author Will Schmidt <will_schmidt@vnet.ibm.com>

Thu, 8 Sep 2011 01:54:41 +0000 (21:54 -0400)

committer Ulrich Drepper <drepper@gmail.com>

Thu, 8 Sep 2011 01:54:41 +0000 (21:54 -0400)
author Will Schmidt <will_schmidt@vnet.ibm.com>
Thu, 8 Sep 2011 01:54:41 +0000 (21:54 -0400)
committer Ulrich Drepper <drepper@gmail.com>
Thu, 8 Sep 2011 01:54:41 +0000 (21:54 -0400)
diff --git a/ChangeLog b/ChangeLog

index c90f2c7..429767d 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2011-07-28  Will Schmidt  <will_schmidt@vnet.ibm.com>
+
+       * sysdeps/powerpc/powerpc32/power7/memcpy.S: Optimize the
+       aligned copy for power7 with vector-scalar instructions.
+       * sysdeps/powerpc/powerpc64/power7/memcpy.S: Likewise.
+
  2011-07-24  H.J. Lu  <hongjiu.lu@intel.com>
  
         * sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Simplify
diff --git a/sysdeps/powerpc/powerpc32/power7/memcpy.S b/sysdeps/powerpc/powerpc32/power7/memcpy.S

index f0c332f..ec70557 100644 (file)
--- a/sysdeps/powerpc/powerpc32/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc32/power7/memcpy.S
@@ -1,5 +1,5 @@
  /* Optimized memcpy implementation for PowerPC32/POWER7.
-   Copyright (C) 2010 Free Software Foundation, Inc.
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
     Contributed by Luis Machado <luisgpm@br.ibm.com>.
     This file is part of the GNU C Library.
  
@@ -116,24 +116,82 @@ L(copy_GE_32_aligned_cont):
         stfd    6,0(3)
         addi    10,3,8
  
+L(aligned_copy):
+       /* Main aligned copy loop.  Copies up to 128 bytes at a time.  */
         .align  4
-4:     /* Main aligned copy loop. Copies 32-bytes at a time.  */
-       lfd     6,0(11)
-       lfd     7,8(11)
-       lfd     8,16(11)
-       lfd     0,24(11)
-       addi    11,11,32
+4:
+       /* Check for any 32-byte or 64-byte lumps that fall outside of a
+          whole 128-byte range.  R8 contains the number of 32-byte
+          lumps, so move its low bits into CR7 and use the SO/EQ bits to
+          handle a 32-byte and/or a 64-byte lump.  Then handle the rest
+          with an unrolled 128-bytes-at-a-time copy loop.  */
+       mtocrf  1,8
+       li      6,16    # 16() index
+       li      7,32    # 32() index
+       li      8,48    # 48() index
+
+L(aligned_32byte):
+       /* if the SO bit (indicating a 32-byte lump) is not set, move along. */
+       bns     cr7,L(aligned_64byte)
+       lxvd2x  6,0,11
+       lxvd2x  7,11,6
+       addi    11,11,32
+       stxvd2x 6,0,10
+       stxvd2x 7,10,6
+       addi    10,10,32
+
+L(aligned_64byte):
+       /* if the EQ bit (indicating a 64-byte lump) is not set, move along. */
+       bne     cr7,L(aligned_128setup)
+       lxvd2x  6,0,11
+       lxvd2x  7,11,6
+       lxvd2x  8,11,7
+       lxvd2x  9,11,8
+       addi    11,11,64
+       stxvd2x 6,0,10
+       stxvd2x 7,10,6
+       stxvd2x 8,10,7
+       stxvd2x 9,10,8
+       addi    10,10,64
+
+L(aligned_128setup):
+       /* Set up for the 128-byte at a time copy loop.  */
+       srwi    8,31,7
+       cmpwi   8,0     # Any 4x lumps left?
+       beq     3f      # if not, move along.
+       lxvd2x  6,0,11
+       lxvd2x  7,11,6
+       mtctr   8       # otherwise, load the ctr and begin.
+       li      8,48    # 48() index
+       b       L(aligned_128loop)
+
+L(aligned_128head):
+       /* Entry point for the second and later iterations of this loop.  */
+       lxvd2x  6,0,11
+       lxvd2x  7,11,6
+L(aligned_128loop):
+       lxvd2x  8,11,7
+       lxvd2x  9,11,8
+       stxvd2x 6,0,10
+       addi    11,11,64
+       stxvd2x 7,10,6
+       stxvd2x 8,10,7
+       stxvd2x 9,10,8
+       lxvd2x  6,0,11
+       lxvd2x  7,11,6
+       addi    10,10,64
+       lxvd2x  8,11,7
+       lxvd2x  9,11,8
+       addi    11,11,64
+       stxvd2x 6,0,10
+       stxvd2x 7,10,6
+       stxvd2x 8,10,7
+       stxvd2x 9,10,8
+       addi    10,10,64
+       bdnz    L(aligned_128head)
  
-       stfd    6,0(10)
-       stfd    7,8(10)
-       stfd    8,16(10)
-       stfd    0,24(10)
-       addi    10,10,32
-       bdnz    4b
  3:
-
         /* Check for tail bytes.  */
-
         clrrwi  0,31,3
         mtcrf   0x01,31
         beq     cr6,0f
diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S

index 2e5beed..8aaef97 100644 (file)
--- a/sysdeps/powerpc/powerpc64/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S
@@ -1,5 +1,5 @@
  /* Optimized memcpy implementation for PowerPC64/POWER7.
-   Copyright (C) 2010 Free Software Foundation, Inc.
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
     Contributed by Luis Machado <luisgpm@br.ibm.com>.
     This file is part of the GNU C Library.
  
@@ -115,23 +115,81 @@ L(copy_GE_32_aligned_cont):
         std     6,0(3)
         addi    10,3,8
  
-       /* Main aligned copy loop. Copies 32-bytes at a time.  */
+L(aligned_copy):
+       /* Main aligned copy loop.  Copies up to 128 bytes at a time.  */
         .align  4
  4:
-       ld      6,0(11)
-       ld      7,8(11)
-       ld      8,16(11)
-       ld      0,24(11)
-       addi    11,11,32
+       /* Check for any 32-byte or 64-byte lumps that fall outside of a
+          whole 128-byte range.  R8 contains the number of 32-byte
+          lumps, so move its low bits into CR7 and use the SO/EQ bits to
+          handle a 32-byte and/or a 64-byte lump.  Then handle the rest
+          with an unrolled 128-bytes-at-a-time copy loop.  */
+       mtocrf  1,8
+       li      6,16    # 16() index
+       li      7,32    # 32() index
+       li      8,48    # 48() index
+
+L(aligned_32byte):
+       /* if the SO bit (indicating a 32-byte lump) is not set, move along. */
+       bns     cr7,L(aligned_64byte)
+       lxvd2x  6,0,11
+       lxvd2x  7,11,6
+       addi    11,11,32
+       stxvd2x 6,0,10
+       stxvd2x 7,10,6
+       addi    10,10,32
+
+L(aligned_64byte):
+       /* if the EQ bit (indicating a 64-byte lump) is not set, move along. */
+       bne     cr7,L(aligned_128setup)
+       lxvd2x  6,0,11
+       lxvd2x  7,11,6
+       lxvd2x  8,11,7
+       lxvd2x  9,11,8
+       addi    11,11,64
+       stxvd2x 6,0,10
+       stxvd2x 7,10,6
+       stxvd2x 8,10,7
+       stxvd2x 9,10,8
+       addi    10,10,64
+
+L(aligned_128setup):
+       /* Set up for the 128-byte at a time copy loop.  */
+       srdi    8,31,7
+       cmpdi   8,0     # Any 4x lumps left?
+       beq     3f      # if not, move along.
+       lxvd2x  6,0,11
+       lxvd2x  7,11,6
+       mtctr   8       # otherwise, load the ctr and begin.
+       li      8,48    # 48() index
+       b       L(aligned_128loop)
+
+L(aligned_128head):
+       /* Entry point for the second and later iterations of this loop.  */
+       lxvd2x  6,0,11
+       lxvd2x  7,11,6
+L(aligned_128loop):
+       lxvd2x  8,11,7
+       lxvd2x  9,11,8
+       stxvd2x 6,0,10
+       addi    11,11,64
+       stxvd2x 7,10,6
+       stxvd2x 8,10,7
+       stxvd2x 9,10,8
+       lxvd2x  6,0,11
+       lxvd2x  7,11,6
+       addi    10,10,64
+       lxvd2x  8,11,7
+       lxvd2x  9,11,8
+       addi    11,11,64
+       stxvd2x 6,0,10
+       stxvd2x 7,10,6
+       stxvd2x 8,10,7
+       stxvd2x 9,10,8
+       addi    10,10,64
+       bdnz    L(aligned_128head)
  
-       std     6,0(10)
-       std     7,8(10)
-       std     8,16(10)
-       std     0,24(10)
-       addi    10,10,32
-       bdnz    4b
  3:
-
         /* Check for tail bytes.  */
         rldicr  0,31,0,60
         mtcrf   0x01,31
author	Will Schmidt <will_schmidt@vnet.ibm.com>
	Thu, 8 Sep 2011 01:54:41 +0000 (21:54 -0400)
committer	Ulrich Drepper <drepper@gmail.com>
	Thu, 8 Sep 2011 01:54:41 +0000 (21:54 -0400)
ChangeLog		patch | blob | history
sysdeps/powerpc/powerpc32/power7/memcpy.S		patch | blob | history
sysdeps/powerpc/powerpc64/power7/memcpy.S		patch | blob | history