From f230c29b40cc36ce62387664be92c3cf94119efe Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Thu, 31 May 2012 14:19:30 -0700
Subject: [PATCH] Avoid performance penalty in sparc optimized memcpy/memset.

fmovd clears the current exception field in the %fsr, fsrc2 does not
and therefore runs more efficiently on some cpus.

	* sysdeps/sparc/sparc64/memcpy.S: Use fsrc2 to move 64-bit
	values between float registers.
	* sysdeps/sparc/sparc64/memset.S: Likewise.
	* sysdeps/sparc/sparc64/multiarch/memcpy-niagara2.S: Likewise.
---
 ChangeLog                                         |  7 +++
 sysdeps/sparc/sparc64/memcpy.S                    |  8 +--
 sysdeps/sparc/sparc64/memset.S                    | 14 ++--
 sysdeps/sparc/sparc64/multiarch/memcpy-niagara2.S | 72 +++++++++++------------
 4 files changed, 54 insertions(+), 47 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 4d71f26..b0a4689 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2012-05-31  David S. Miller
+
+	* sysdeps/sparc/sparc64/memcpy.S: Use fsrc2 to move 64-bit
+	values between float registers.
+	* sysdeps/sparc/sparc64/memset.S: Likewise.
+	* sysdeps/sparc/sparc64/multiarch/memcpy-niagara2.S: Likewise.
+
 2012-05-31  Mike Frysinger
 
 	* debug/Makefile (CFLAGS-tst-longjmp_chk.c): Delete
diff --git a/sysdeps/sparc/sparc64/memcpy.S b/sysdeps/sparc/sparc64/memcpy.S
index 668ebec..8371088 100644
--- a/sysdeps/sparc/sparc64/memcpy.S
+++ b/sysdeps/sparc/sparc64/memcpy.S
@@ -79,7 +79,7 @@
 #define UNEVEN_VISCHUNK(dest, f0, f1, left)	\
 	subcc		%left, 8, %left;	\
 	bl,pn		%xcc, 205f;		\
-	 fsrc1		%f0, %f1;		\
+	 fsrc2		%f0, %f1;		\
 	ba,a,pt		%xcc, 204f;
 
 /* Macros for non-VIS memcpy code. */
@@ -162,7 +162,7 @@ ENTRY(__memcpy_large)
 3:	andcc		%o0, 0x38, %g5		/* IEU1		Group	*/
 201:	be,pt		%icc, 202f		/* CTI			*/
 	 mov		64, %g1			/* IEU0			*/
-	fmovd		%f0, %f2		/* FPU			*/
+	fsrc2		%f0, %f2		/* FPU			*/
 	sub		%g1, %g5, %g5		/* IEU0		Group	*/
 	alignaddr	%o1, %g0, %g1		/* GRU		Group	*/
 	ldd		[%g1], %f4		/* Load		Group	*/
@@ -193,7 +193,7 @@ ENTRY(__memcpy_large)
 	andn		%o1, (0x40 - 1), %o1	/* IEU1			*/
 	and		%g2, 7, %g2		/* IEU0		Group	*/
 	andncc		%g3, 0x7, %g3		/* IEU1			*/
-	fmovd		%f0, %f2		/* FPU			*/
+	fsrc2		%f0, %f2		/* FPU			*/
 	sub		%g3, 0x10, %g3		/* IEU0		Group	*/
 	sub		%o2, %g6, %o2		/* IEU1			*/
 	alignaddr	%g1, %g0, %g0		/* GRU		Group	*/
@@ -541,7 +541,7 @@ ENTRY(memcpy)
 	stb		%g5, [%o0 - 1]		/* Store		*/
 2:	andn		%o2, 7, %g5		/* IEU0		Group	*/
 	and		%o2, 7, %o2		/* IEU1			*/
-	fmovd		%f0, %f2		/* FPU			*/
+	fsrc2		%f0, %f2		/* FPU			*/
 	alignaddr	%o1, %g0, %g1		/* GRU		Group	*/
 	ldd		[%g1], %f4		/* Load		Group	*/
1:	ldd		[%g1 + 0x8], %f6	/* Load		Group	*/
diff --git a/sysdeps/sparc/sparc64/memset.S b/sysdeps/sparc/sparc64/memset.S
index b9c52aa..5e92936 100644
--- a/sysdeps/sparc/sparc64/memset.S
+++ b/sysdeps/sparc/sparc64/memset.S
@@ -109,16 +109,16 @@ ENTRY(memset)
 	membar		#StoreStore | #LoadStore
 	andcc		%o3, 0xc0, %g5
 	and		%o2, 0x3f, %o2
-	fmovd		%f0, %f2
-	fmovd		%f0, %f4
+	fsrc2		%f0, %f2
+	fsrc2		%f0, %f4
 	andn		%o3, 0xff, %o3
-	fmovd		%f0, %f6
+	fsrc2		%f0, %f6
 	cmp		%g5, 64
-	fmovd		%f0, %f8
-	fmovd		%f0, %f10
-	fmovd		%f0, %f12
+	fsrc2		%f0, %f8
+	fsrc2		%f0, %f10
+	fsrc2		%f0, %f12
 	brz,pn		%g5, 10f
-	fmovd		%f0, %f14
+	fsrc2		%f0, %f14
 	be,pn		%icc, 2f
 	 stda		%f0, [%o0 + 0x00] %asi
 	cmp		%g5, 128
diff --git a/sysdeps/sparc/sparc64/multiarch/memcpy-niagara2.S b/sysdeps/sparc/sparc64/multiarch/memcpy-niagara2.S
index 0e9442d..fb815e5 100644
--- a/sysdeps/sparc/sparc64/multiarch/memcpy-niagara2.S
+++ b/sysdeps/sparc/sparc64/multiarch/memcpy-niagara2.S
@@ -58,49 +58,49 @@
 	faligndata	%x7, %x8, %f14;
 
 #define FREG_MOVE_1(x0)			\
-	fmovd		%x0, %f0;
+	fsrc2		%x0, %f0;
 #define FREG_MOVE_2(x0, x1)		\
-	fmovd		%x0, %f0;	\
-	fmovd		%x1, %f2;
+	fsrc2		%x0, %f0;	\
+	fsrc2		%x1, %f2;
 #define FREG_MOVE_3(x0, x1, x2)		\
-	fmovd		%x0, %f0;	\
-	fmovd		%x1, %f2;	\
-	fmovd		%x2, %f4;
+	fsrc2		%x0, %f0;	\
+	fsrc2		%x1, %f2;	\
+	fsrc2		%x2, %f4;
 #define FREG_MOVE_4(x0, x1, x2, x3)	\
-	fmovd		%x0, %f0;	\
-	fmovd		%x1, %f2;	\
-	fmovd		%x2, %f4;	\
-	fmovd		%x3, %f6;
+	fsrc2		%x0, %f0;	\
+	fsrc2		%x1, %f2;	\
+	fsrc2		%x2, %f4;	\
+	fsrc2		%x3, %f6;
 #define FREG_MOVE_5(x0, x1, x2, x3, x4)	\
-	fmovd		%x0, %f0;	\
-	fmovd		%x1, %f2;	\
-	fmovd		%x2, %f4;	\
-	fmovd		%x3, %f6;	\
-	fmovd		%x4, %f8;
+	fsrc2		%x0, %f0;	\
+	fsrc2		%x1, %f2;	\
+	fsrc2		%x2, %f4;	\
+	fsrc2		%x3, %f6;	\
+	fsrc2		%x4, %f8;
 #define FREG_MOVE_6(x0, x1, x2, x3, x4, x5) \
-	fmovd		%x0, %f0;	\
-	fmovd		%x1, %f2;	\
-	fmovd		%x2, %f4;	\
-	fmovd		%x3, %f6;	\
-	fmovd		%x4, %f8;	\
-	fmovd		%x5, %f10;
+	fsrc2		%x0, %f0;	\
+	fsrc2		%x1, %f2;	\
+	fsrc2		%x2, %f4;	\
+	fsrc2		%x3, %f6;	\
+	fsrc2		%x4, %f8;	\
+	fsrc2		%x5, %f10;
 #define FREG_MOVE_7(x0, x1, x2, x3, x4, x5, x6) \
-	fmovd		%x0, %f0;	\
-	fmovd		%x1, %f2;	\
-	fmovd		%x2, %f4;	\
-	fmovd		%x3, %f6;	\
-	fmovd		%x4, %f8;	\
-	fmovd		%x5, %f10;	\
-	fmovd		%x6, %f12;
+	fsrc2		%x0, %f0;	\
+	fsrc2		%x1, %f2;	\
+	fsrc2		%x2, %f4;	\
+	fsrc2		%x3, %f6;	\
+	fsrc2		%x4, %f8;	\
+	fsrc2		%x5, %f10;	\
+	fsrc2		%x6, %f12;
 #define FREG_MOVE_8(x0, x1, x2, x3, x4, x5, x6, x7) \
-	fmovd		%x0, %f0;	\
-	fmovd		%x1, %f2;	\
-	fmovd		%x2, %f4;	\
-	fmovd		%x3, %f6;	\
-	fmovd		%x4, %f8;	\
-	fmovd		%x5, %f10;	\
-	fmovd		%x6, %f12;	\
-	fmovd		%x7, %f14;
+	fsrc2		%x0, %f0;	\
+	fsrc2		%x1, %f2;	\
+	fsrc2		%x2, %f4;	\
+	fsrc2		%x3, %f6;	\
+	fsrc2		%x4, %f8;	\
+	fsrc2		%x5, %f10;	\
+	fsrc2		%x6, %f12;	\
+	fsrc2		%x7, %f14;
 #define FREG_LOAD_1(base, x0)		\
 	LOAD(ldd, base + 0x00, %x0)
 #define FREG_LOAD_2(base, x0, x1)	\
-- 
2.7.4
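
Note (not part of the patch): a minimal sketch of the substitution every hunk above applies, using the %f0/%f2 operands that appear in the memset.S hunk. The comments restate the rationale from the commit message, namely that fmovd also writes the current-exception field of the %fsr while the VIS register copy fsrc2 leaves %fsr untouched; the claim about the performance effect is the commit's, not verified here.

	/* Before: fmovd is an FPop, so besides copying the 64-bit value
	   it clears the cexc field of the %fsr, adding an %fsr write on
	   every iteration of the copy/set loops.  */
	fmovd	%f0, %f2

	/* After: fsrc2 (VIS) performs the same 64-bit register-to-register
	   copy but, per the commit message, does not touch the %fsr, so it
	   runs more efficiently on some cpus.  */
	fsrc2	%f0, %f2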