From 5025581e1c66a184a587ab1bd99cd168e8fb7770 Mon Sep 17 00:00:00 2001
From: Will Schmidt <will_schmidt@vnet.ibm.com>
Date: Wed, 7 Sep 2011 21:54:41 -0400
Subject: [PATCH] power7 memcpy VSX optimizations

---
 ChangeLog                                 |  6 +++
 sysdeps/powerpc/powerpc32/power7/memcpy.S | 88 +++++++++++++++++++++++++------
 sysdeps/powerpc/powerpc64/power7/memcpy.S | 86 +++++++++++++++++++++++++-----
 3 files changed, 151 insertions(+), 29 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index c90f2c7..429767d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2011-07-28  Will Schmidt  <will_schmidt@vnet.ibm.com>
+
+	* sysdeps/powerpc/powerpc32/power7/memcpy.S: Optimize the
+	aligned copy for power7 with vector-scalar instructions.
+	* sysdeps/powerpc/powerpc64/power7/memcpy.S: Likewise.
+
 2011-07-24  H.J. Lu  <hongjiu.lu@intel.com>
 
 	* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Simplify
diff --git a/sysdeps/powerpc/powerpc32/power7/memcpy.S b/sysdeps/powerpc/powerpc32/power7/memcpy.S
index f0c332f..ec70557 100644
--- a/sysdeps/powerpc/powerpc32/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc32/power7/memcpy.S
@@ -1,5 +1,5 @@
 /* Optimized memcpy implementation for PowerPC32/POWER7.
-   Copyright (C) 2010 Free Software Foundation, Inc.
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
    Contributed by Luis Machado <luisgpm@br.ibm.com>.
    This file is part of the GNU C Library.
 
@@ -116,24 +116,82 @@ L(copy_GE_32_aligned_cont):
 	stfd    6,0(3)
 	addi    10,3,8
 
+L(aligned_copy):
+	/* Main aligned copy loop.  Copies up to 128 bytes at a time.  */
 	.align  4
-4:	/* Main aligned copy loop. Copies 32-bytes at a time.  */
-	lfd	6,0(11)
-	lfd     7,8(11)
-	lfd     8,16(11)
-	lfd     0,24(11)
-	addi    11,11,32
+4:
+	/* Copy any 32-byte or 64-byte lump that falls outside of a whole
+	   128-byte block first.  R8 contains the number of 32-byte lumps,
+	   so move its low four bits into CR7: SO is then the low bit (one
+	   32-byte lump) and EQ the next bit up (one 64-byte lump).  Then
+	   handle the rest with an unrolled 128-bytes-at-a-time copy loop.  */
+	mtocrf	1,8	# CR7 = low 4 bits of r8
+	li	6,16	# 16() index
+	li	7,32	# 32() index
+	li	8,48	# 48() index; r8 is free now that CR7 holds its low bits
+
+L(aligned_32byte):
+	/* Copy one 32-byte lump from r11 to r10 if CR7[SO] is set, else skip.  */
+	bns	cr7,L(aligned_64byte)
+	lxvd2x	6,0,11	# vs6 = 16 bytes at 0(r11)
+	lxvd2x	7,11,6	# vs7 = 16 bytes at 16(r11), r6 = 16
+	addi	11,11,32	# source += 32
+	stxvd2x	6,0,10	# store vs6 at 0(r10)
+	stxvd2x	7,10,6	# store vs7 at 16(r10)
+	addi	10,10,32	# destination += 32
+
+L(aligned_64byte):
+	/* Copy one 64-byte lump from r11 to r10 if CR7[EQ] is set, else skip.  */
+	bne	cr7,L(aligned_128setup)
+	lxvd2x	6,0,11	# vs6 = 16 bytes at 0(r11)
+	lxvd2x	7,11,6	# vs7 = 16 bytes at 16(r11)
+	lxvd2x	8,11,7	# vs8 = 16 bytes at 32(r11)
+	lxvd2x	9,11,8	# vs9 = 16 bytes at 48(r11)
+	addi	11,11,64	# source += 64
+	stxvd2x	6,0,10	# store vs6 at 0(r10)
+	stxvd2x	7,10,6	# store vs7 at 16(r10)
+	stxvd2x	8,10,7	# store vs8 at 32(r10)
+	stxvd2x	9,10,8	# store vs9 at 48(r10)
+	addi	10,10,64	# destination += 64
+
+L(aligned_128setup):
+	/* Set up CTR and preload vs6/vs7 for the 128-byte-at-a-time loop.  */
+	srwi	8,31,7	# r8 = r31 >> 7 (r31 presumably the byte count; confirm)
+	cmpwi	8,0	# Any 4x lumps left?
+	beq	3f	# if not, move along.
+	lxvd2x	6,0,11	# preload vs6 = 0(r11) for the first iteration
+	lxvd2x	7,11,6	# preload vs7 = 16(r11) for the first iteration
+	mtctr	8	# otherwise, load the ctr and begin.
+	li	8,48	# restore the 48() index overwritten by srwi above
+	b	L(aligned_128loop)	# skip the reload at L(aligned_128head)
+
+L(aligned_128head):
+	/* Second and later iterations enter here to reload vs6/vs7.  */
+	lxvd2x	6,0,11	# vs6 = 0(r11)
+	lxvd2x	7,11,6	# vs7 = 16(r11)
+L(aligned_128loop):
+	lxvd2x	8,11,7	# vs8 = 32(r11)
+	lxvd2x	9,11,8	# vs9 = 48(r11)
+	stxvd2x	6,0,10	# first 64 bytes: store vs6..vs9 at r10
+	addi	11,11,64	# source += 64, placed between the stores
+	stxvd2x	7,10,6
+	stxvd2x	8,10,7
+	stxvd2x	9,10,8
+	lxvd2x	6,0,11	# second 64 bytes: load from the advanced source
+	lxvd2x	7,11,6
+	addi	10,10,64	# destination += 64
+	lxvd2x	8,11,7
+	lxvd2x	9,11,8
+	addi	11,11,64	# source += 64
+	stxvd2x	6,0,10	# store the second 64 bytes
+	stxvd2x	7,10,6
+	stxvd2x	8,10,7
+	stxvd2x	9,10,8
+	addi	10,10,64	# destination += 64
+	bdnz	L(aligned_128head)	# decrement CTR, loop while it is nonzero
 
-	stfd    6,0(10)
-	stfd    7,8(10)
-	stfd    8,16(10)
-	stfd    0,24(10)
-	addi    10,10,32
-	bdnz    4b
 3:
-
 	/* Check for tail bytes.  */
-
 	clrrwi  0,31,3
 	mtcrf   0x01,31
 	beq	cr6,0f
diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S
index 2e5beed..8aaef97 100644
--- a/sysdeps/powerpc/powerpc64/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S
@@ -1,5 +1,5 @@
 /* Optimized memcpy implementation for PowerPC64/POWER7.
-   Copyright (C) 2010 Free Software Foundation, Inc.
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
    Contributed by Luis Machado <luisgpm@br.ibm.com>.
    This file is part of the GNU C Library.
 
@@ -115,23 +115,81 @@ L(copy_GE_32_aligned_cont):
 	std     6,0(3)
 	addi    10,3,8
 
-	/* Main aligned copy loop. Copies 32-bytes at a time.  */
+L(aligned_copy):
+	/* Main aligned copy loop.  Copies up to 128 bytes at a time.  */
 	.align  4
 4:
-	ld	6,0(11)
-	ld      7,8(11)
-	ld      8,16(11)
-	ld      0,24(11)
-	addi    11,11,32
+	/* Copy any 32-byte or 64-byte lump that falls outside of a whole
+	   128-byte block first.  R8 contains the number of 32-byte lumps,
+	   so move its low four bits into CR7: SO is then the low bit (one
+	   32-byte lump) and EQ the next bit up (one 64-byte lump).  Then
+	   handle the rest with an unrolled 128-bytes-at-a-time copy loop.  */
+	mtocrf	1,8	# CR7 = low 4 bits of r8
+	li	6,16	# 16() index
+	li	7,32	# 32() index
+	li	8,48	# 48() index; r8 is free now that CR7 holds its low bits
+
+L(aligned_32byte):
+	/* Copy one 32-byte lump from r11 to r10 if CR7[SO] is set, else skip.  */
+	bns	cr7,L(aligned_64byte)
+	lxvd2x	6,0,11	# vs6 = 16 bytes at 0(r11)
+	lxvd2x	7,11,6	# vs7 = 16 bytes at 16(r11), r6 = 16
+	addi	11,11,32	# source += 32
+	stxvd2x	6,0,10	# store vs6 at 0(r10)
+	stxvd2x	7,10,6	# store vs7 at 16(r10)
+	addi	10,10,32	# destination += 32
+
+L(aligned_64byte):
+	/* Copy one 64-byte lump from r11 to r10 if CR7[EQ] is set, else skip.  */
+	bne	cr7,L(aligned_128setup)
+	lxvd2x	6,0,11	# vs6 = 16 bytes at 0(r11)
+	lxvd2x	7,11,6	# vs7 = 16 bytes at 16(r11)
+	lxvd2x	8,11,7	# vs8 = 16 bytes at 32(r11)
+	lxvd2x	9,11,8	# vs9 = 16 bytes at 48(r11)
+	addi	11,11,64	# source += 64
+	stxvd2x	6,0,10	# store vs6 at 0(r10)
+	stxvd2x	7,10,6	# store vs7 at 16(r10)
+	stxvd2x	8,10,7	# store vs8 at 32(r10)
+	stxvd2x	9,10,8	# store vs9 at 48(r10)
+	addi	10,10,64	# destination += 64
+
+L(aligned_128setup):
+	/* Set up CTR and preload vs6/vs7 for the 128-byte-at-a-time loop.  */
+	srdi	8,31,7	# r8 = r31 >> 7 (r31 presumably the byte count; confirm)
+	cmpdi	8,0	# Any 4x lumps left?
+	beq	3f	# if not, move along.
+	lxvd2x	6,0,11	# preload vs6 = 0(r11) for the first iteration
+	lxvd2x	7,11,6	# preload vs7 = 16(r11) for the first iteration
+	mtctr	8	# otherwise, load the ctr and begin.
+	li	8,48	# restore the 48() index overwritten by srdi above
+	b	L(aligned_128loop)	# skip the reload at L(aligned_128head)
+
+L(aligned_128head):
+	/* Second and later iterations enter here to reload vs6/vs7.  */
+	lxvd2x	6,0,11	# vs6 = 0(r11)
+	lxvd2x	7,11,6	# vs7 = 16(r11)
+L(aligned_128loop):
+	lxvd2x	8,11,7	# vs8 = 32(r11)
+	lxvd2x	9,11,8	# vs9 = 48(r11)
+	stxvd2x	6,0,10	# first 64 bytes: store vs6..vs9 at r10
+	addi	11,11,64	# source += 64, placed between the stores
+	stxvd2x	7,10,6
+	stxvd2x	8,10,7
+	stxvd2x	9,10,8
+	lxvd2x	6,0,11	# second 64 bytes: load from the advanced source
+	lxvd2x	7,11,6
+	addi	10,10,64	# destination += 64
+	lxvd2x	8,11,7
+	lxvd2x	9,11,8
+	addi	11,11,64	# source += 64
+	stxvd2x	6,0,10	# store the second 64 bytes
+	stxvd2x	7,10,6
+	stxvd2x	8,10,7
+	stxvd2x	9,10,8
+	addi	10,10,64	# destination += 64
+	bdnz	L(aligned_128head)	# decrement CTR, loop while it is nonzero
 
-	std     6,0(10)
-	std     7,8(10)
-	std     8,16(10)
-	std     0,24(10)
-	addi    10,10,32
-	bdnz    4b
 3:
-
 	/* Check for tail bytes.  */
 	rldicr  0,31,0,60
 	mtcrf   0x01,31
-- 
2.7.4