From f230c29b40cc36ce62387664be92c3cf94119efe Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Thu, 31 May 2012 14:19:30 -0700
Subject: [PATCH] Avoid performance penalty in sparc optimized memcpy/memset.

fmovd clears the current exception field in the %fsr, fsrc2 does not
and therefore runs more efficiently on some cpus.

	* sysdeps/sparc/sparc64/memcpy.S: Use fsrc2 to move 64-bit
	values between float registers.
	* sysdeps/sparc/sparc64/memset.S: Likewise.
	* sysdeps/sparc/sparc64/multiarch/memcpy-niagara2.S: Likewise.
---
 ChangeLog                                         |  7 +++
 sysdeps/sparc/sparc64/memcpy.S                    |  8 +--
 sysdeps/sparc/sparc64/memset.S                    | 14 ++--
 sysdeps/sparc/sparc64/multiarch/memcpy-niagara2.S | 72 +++++++++++------------
 4 files changed, 54 insertions(+), 47 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 4d71f26..b0a4689 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2012-05-31  David S. Miller
+
+	* sysdeps/sparc/sparc64/memcpy.S: Use fsrc2 to move 64-bit
+	values between float registers.
+	* sysdeps/sparc/sparc64/memset.S: Likewise.
+	* sysdeps/sparc/sparc64/multiarch/memcpy-niagara2.S: Likewise.
+
 2012-05-31  Mike Frysinger
 
 	* debug/Makefile (CFLAGS-tst-longjmp_chk.c): Delete
diff --git a/sysdeps/sparc/sparc64/memcpy.S b/sysdeps/sparc/sparc64/memcpy.S
index 668ebec..8371088 100644
--- a/sysdeps/sparc/sparc64/memcpy.S
+++ b/sysdeps/sparc/sparc64/memcpy.S
@@ -79,7 +79,7 @@
 #define UNEVEN_VISCHUNK(dest, f0, f1, left)	\
 	subcc		%left, 8, %left;	\
 	bl,pn		%xcc, 205f;		\
-	 fsrc1		%f0, %f1;		\
+	 fsrc2		%f0, %f1;		\
 	ba,a,pt		%xcc, 204f;
 
 /* Macros for non-VIS memcpy code. */
@@ -162,7 +162,7 @@ ENTRY(__memcpy_large)
 3:	andcc		%o0, 0x38, %g5		/* IEU1		Group	*/
 201:	be,pt		%icc, 202f		/* CTI			*/
 	 mov		64, %g1			/* IEU0			*/
-	fmovd		%f0, %f2		/* FPU			*/
+	fsrc2		%f0, %f2		/* FPU			*/
 	sub		%g1, %g5, %g5		/* IEU0		Group	*/
 	alignaddr	%o1, %g0, %g1		/* GRU		Group	*/
 	ldd		[%g1], %f4		/* Load		Group	*/
@@ -193,7 +193,7 @@ ENTRY(__memcpy_large)
 	andn		%o1, (0x40 - 1), %o1	/* IEU1			*/
 	and		%g2, 7, %g2		/* IEU0		Group	*/
 	andncc		%g3, 0x7, %g3		/* IEU1			*/
-	fmovd		%f0, %f2		/* FPU			*/
+	fsrc2		%f0, %f2		/* FPU			*/
 	sub		%g3, 0x10, %g3		/* IEU0		Group	*/
 	sub		%o2, %g6, %o2		/* IEU1			*/
 	alignaddr	%g1, %g0, %g0		/* GRU		Group	*/
@@ -541,7 +541,7 @@ ENTRY(memcpy)
 	stb		%g5, [%o0 - 1]		/* Store		*/
 2:	andn		%o2, 7, %g5		/* IEU0		Group	*/
 	and		%o2, 7, %o2		/* IEU1			*/
-	fmovd		%f0, %f2		/* FPU			*/
+	fsrc2		%f0, %f2		/* FPU			*/
 	alignaddr	%o1, %g0, %g1		/* GRU		Group	*/
 	ldd		[%g1], %f4		/* Load		Group	*/
1:	ldd		[%g1 + 0x8], %f6	/* Load		Group	*/
diff --git a/sysdeps/sparc/sparc64/memset.S b/sysdeps/sparc/sparc64/memset.S
index b9c52aa..5e92936 100644
--- a/sysdeps/sparc/sparc64/memset.S
+++ b/sysdeps/sparc/sparc64/memset.S
@@ -109,16 +109,16 @@ ENTRY(memset)
 	membar		#StoreStore | #LoadStore
 	andcc		%o3, 0xc0, %g5
 	and		%o2, 0x3f, %o2
-	fmovd		%f0, %f2
-	fmovd		%f0, %f4
+	fsrc2		%f0, %f2
+	fsrc2		%f0, %f4
 	andn		%o3, 0xff, %o3
-	fmovd		%f0, %f6
+	fsrc2		%f0, %f6
 	cmp		%g5, 64
-	fmovd		%f0, %f8
-	fmovd		%f0, %f10
-	fmovd		%f0, %f12
+	fsrc2		%f0, %f8
+	fsrc2		%f0, %f10
+	fsrc2		%f0, %f12
 	brz,pn		%g5, 10f
-	fmovd		%f0, %f14
+	fsrc2		%f0, %f14
 	be,pn		%icc, 2f
 	 stda		%f0, [%o0 + 0x00] %asi
 	cmp		%g5, 128
diff --git a/sysdeps/sparc/sparc64/multiarch/memcpy-niagara2.S b/sysdeps/sparc/sparc64/multiarch/memcpy-niagara2.S
index 0e9442d..fb815e5 100644
--- a/sysdeps/sparc/sparc64/multiarch/memcpy-niagara2.S
+++ b/sysdeps/sparc/sparc64/multiarch/memcpy-niagara2.S
@@ -58,49 +58,49 @@
 	faligndata	%x7, %x8, %f14;
 
 #define FREG_MOVE_1(x0)			\
-	fmovd		%x0, %f0;
+	fsrc2		%x0, %f0;
 #define FREG_MOVE_2(x0, x1)		\
-	fmovd		%x0, %f0;	\
-	fmovd		%x1, %f2;
+	fsrc2		%x0, %f0;	\
+	fsrc2		%x1, %f2;
 #define FREG_MOVE_3(x0, x1, x2)		\
-	fmovd		%x0, %f0;	\
-	fmovd		%x1, %f2;	\
-	fmovd		%x2, %f4;
+	fsrc2		%x0, %f0;	\
+	fsrc2		%x1, %f2;	\
+	fsrc2		%x2, %f4;
 #define FREG_MOVE_4(x0, x1, x2, x3)	\
-	fmovd		%x0, %f0;	\
-	fmovd		%x1, %f2;	\
-	fmovd		%x2, %f4;	\
-	fmovd		%x3, %f6;
+	fsrc2		%x0, %f0;	\
+	fsrc2		%x1, %f2;	\
+	fsrc2		%x2, %f4;	\
+	fsrc2		%x3, %f6;
 #define FREG_MOVE_5(x0, x1, x2, x3, x4)	\
-	fmovd		%x0, %f0;	\
-	fmovd		%x1, %f2;	\
-	fmovd		%x2, %f4;	\
-	fmovd		%x3, %f6;	\
-	fmovd		%x4, %f8;
+	fsrc2		%x0, %f0;	\
+	fsrc2		%x1, %f2;	\
+	fsrc2		%x2, %f4;	\
+	fsrc2		%x3, %f6;	\
+	fsrc2		%x4, %f8;
 #define FREG_MOVE_6(x0, x1, x2, x3, x4, x5) \
-	fmovd		%x0, %f0;	\
-	fmovd		%x1, %f2;	\
-	fmovd		%x2, %f4;	\
-	fmovd		%x3, %f6;	\
-	fmovd		%x4, %f8;	\
-	fmovd		%x5, %f10;
+	fsrc2		%x0, %f0;	\
+	fsrc2		%x1, %f2;	\
+	fsrc2		%x2, %f4;	\
+	fsrc2		%x3, %f6;	\
+	fsrc2		%x4, %f8;	\
+	fsrc2		%x5, %f10;
 #define FREG_MOVE_7(x0, x1, x2, x3, x4, x5, x6) \
-	fmovd		%x0, %f0;	\
-	fmovd		%x1, %f2;	\
-	fmovd		%x2, %f4;	\
-	fmovd		%x3, %f6;	\
-	fmovd		%x4, %f8;	\
-	fmovd		%x5, %f10;	\
-	fmovd		%x6, %f12;
+	fsrc2		%x0, %f0;	\
+	fsrc2		%x1, %f2;	\
+	fsrc2		%x2, %f4;	\
+	fsrc2		%x3, %f6;	\
+	fsrc2		%x4, %f8;	\
+	fsrc2		%x5, %f10;	\
+	fsrc2		%x6, %f12;
 #define FREG_MOVE_8(x0, x1, x2, x3, x4, x5, x6, x7) \
-	fmovd		%x0, %f0;	\
-	fmovd		%x1, %f2;	\
-	fmovd		%x2, %f4;	\
-	fmovd		%x3, %f6;	\
-	fmovd		%x4, %f8;	\
-	fmovd		%x5, %f10;	\
-	fmovd		%x6, %f12;	\
-	fmovd		%x7, %f14;
+	fsrc2		%x0, %f0;	\
+	fsrc2		%x1, %f2;	\
+	fsrc2		%x2, %f4;	\
+	fsrc2		%x3, %f6;	\
+	fsrc2		%x4, %f8;	\
+	fsrc2		%x5, %f10;	\
+	fsrc2		%x6, %f12;	\
+	fsrc2		%x7, %f14;
 #define FREG_LOAD_1(base, x0)		\
 	LOAD(ldd, base + 0x00, %x0)
 #define FREG_LOAD_2(base, x0, x1)	\
-- 
2.7.4
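
Note (not part of the patch): a minimal sketch of the substitution every hunk above applies, using the %f0/%f2 operands that appear in the memset.S hunk. The comments restate the rationale from the commit message, namely that fmovd also writes the current-exception field of the %fsr while the VIS register copy fsrc2 leaves %fsr untouched; the claim about the performance effect is the commit's, not verified here.

	/* Before: fmovd is an FPop, so besides copying the 64-bit value
	   it clears the cexc field of the %fsr, adding an %fsr write on
	   every iteration of the copy/set loops.  */
	fmovd	%f0, %f2

	/* After: fsrc2 (VIS) performs the same 64-bit register-to-register
	   copy but, per the commit message, does not touch the %fsr, so it
	   runs more efficiently on some cpus.  */
	fsrc2	%f0, %f2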