memcpy for ppc/cell.
author    Ulrich Drepper <drepper@redhat.com>
          Mon, 18 Jan 2010 20:40:29 +0000 (12:40 -0800)
committer Ulrich Drepper <drepper@redhat.com>
          Mon, 18 Jan 2010 20:40:29 +0000 (12:40 -0800)
ChangeLog
sysdeps/powerpc/powerpc32/cell/memcpy.S [new file with mode: 0644]
sysdeps/powerpc/powerpc64/cell/memcpy.S [new file with mode: 0644]
sysdeps/unix/sysv/linux/powerpc/powerpc32/cell/fpu/Implies [new file with mode: 0644]
sysdeps/unix/sysv/linux/powerpc/powerpc64/cell/fpu/Implies [new file with mode: 0644]

index 92ed81c..8f6695b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2010-01-14  Ryan S. Arnold  <rsa@us.ibm.com>
+
+       * sysdeps/powerpc/powerpc32/cell/memcpy.S: New file.
+       * sysdeps/powerpc/powerpc64/cell/memcpy.S: New file.
+       * sysdeps/unix/sysv/linux/powerpc/powerpc32/cell/fpu/Implies: New file.
+       * sysdeps/unix/sysv/linux/powerpc/powerpc64/cell/fpu/Implies: New file.
+
 2010-01-18  Andreas Schwab  <schwab@redhat.com>
 
        * sysdeps/unix/sysv/linux/sparc/bits/fcntl.h: Remove duplicate
diff --git a/sysdeps/powerpc/powerpc32/cell/memcpy.S b/sysdeps/powerpc/powerpc32/cell/memcpy.S
new file mode 100644
index 0000000..e6c076c
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/cell/memcpy.S
@@ -0,0 +1,245 @@
+/* Optimized memcpy implementation for CELL BE PowerPC.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+
+#define PREFETCH_AHEAD 6       /* number of cache lines to prefetch ahead of SRC  */
+#define ZERO_AHEAD 4           /* number of cache lines to zero ahead of DST  */
+
+/* memcpy routine optimized for CELL-BE-PPC    v2.0
+ *
+ * The CELL PPC core has 1 integer unit and 1 load/store unit.
+ * CELL:
+ * 1st level data cache = 32K
+ * 2nd level data cache = 512K
+ * 3rd level data cache = 0K
+ * At a 3.2 GHz clock rate the latency to the 2nd level cache is >36 clocks;
+ * latency to memory is >400 clocks.
+ * To improve copy performance we need to prefetch source data
+ * far ahead to hide this latency.
+ * For best performance, instruction forms ending in "." like "andi."
+ * should be avoided, as they are implemented in microcode on CELL.
+ * The code below is loop-unrolled for the CELL cache line of 128 bytes.
+ */
+
+.align  7
+
+EALIGN (BP_SYM (memcpy), 5, 0)
+        CALL_MCOUNT
+
+       dcbt    0,r4            /* Prefetch ONE SRC cacheline  */
+       cmplwi  cr1,r5,16       /* is size < 16 ?  */
+       mr      r6,r3           
+       blt+    cr1,.Lshortcopy
+
+.Lbigcopy:
+       neg     r8,r3           /* LS 3 bits = # bytes to 8-byte dest bdry  */
+        clrlwi  r8,r8,32-4     /* align to 16-byte boundary  */
+       sub     r7,r4,r3
+       cmplwi  cr0,r8,0
+       beq+    .Ldst_aligned
+
+.Ldst_unaligned:
+       mtcrf   0x01,r8         /* put #bytes to boundary into cr7  */
+       subf    r5,r8,r5
+
+       bf      cr7*4+3,1f
+       lbzx    r0,r7,r6        /* copy 1 byte  */
+       stb     r0,0(r6)
+       addi    r6,r6,1
+1:     bf      cr7*4+2,2f
+       lhzx    r0,r7,r6        /* copy 2 byte  */
+       sth     r0,0(r6)
+       addi    r6,r6,2
+2:     bf      cr7*4+1,4f
+       lwzx    r0,r7,r6        /* copy 4 byte  */
+       stw     r0,0(r6)
+       addi    r6,r6,4
+4:     bf      cr7*4+0,8f
+       lfdx    fp9,r7,r6       /* copy 8 byte  */
+       stfd    fp9,0(r6)
+       addi    r6,r6,8
+8:
+       add     r4,r7,r6
+
+.Ldst_aligned:
+
+       cmpwi   cr5,r5,128-1
+
+       neg     r7,r6
+       addi    r6,r6,-8        /* prepare for stfdu  */
+       addi    r4,r4,-8        /* prepare for lfdu  */
+
+       clrlwi  r7,r7,32-7      /* align to cacheline boundary  */
+       ble+    cr5,.Llessthancacheline
+
+       cmplwi  cr6,r7,0
+       subf    r5,r7,r5
+       srwi    r7,r7,4         /* divide size by 16  */
+       srwi    r10,r5,7        /* number of cache lines to copy  */
+
+       cmplwi  r10,0
+       li      r11,0           /* number of cache lines to copy with prefetch  */
+       beq     .Lnocacheprefetch
+
+       cmplwi  r10,PREFETCH_AHEAD
+       li      r12,128+8       /* prefetch distance  */
+       ble     .Llessthanmaxprefetch
+
+       subi    r11,r10,PREFETCH_AHEAD
+       li      r10,PREFETCH_AHEAD
+
+.Llessthanmaxprefetch:
+       mtctr   r10
+
+.LprefetchSRC:
+       dcbt    r12,r4
+        addi    r12,r12,128
+        bdnz    .LprefetchSRC
+
+.Lnocacheprefetch:
+       mtctr   r7
+       cmplwi  cr1,r5,128
+       clrlwi  r5,r5,32-7
+       beq     cr6,.Lcachelinealigned
+
+.Laligntocacheline:
+       lfd     fp9,0x08(r4)
+       lfdu    fp10,0x10(r4)
+       stfd    fp9,0x08(r6)
+       stfdu   fp10,0x10(r6)
+       bdnz    .Laligntocacheline
+
+
+.Lcachelinealigned:            /* copy whole cache lines  */
+
+       blt-    cr1,.Llessthancacheline /* size <128  */
+
+.Louterloop:
+        cmpwi   r11,0
+       mtctr   r11
+       beq-    .Lendloop
+
+       li      r11,128*ZERO_AHEAD +8   /* DCBZ dist  */
+
+.align 4
+       /* Copy whole cachelines, optimized by prefetching SRC cacheline  */
+.Lloop:                        /* Copy aligned body  */
+       dcbt    r12,r4          /* PREFETCH SOURCE some cache lines ahead  */
+       lfd     fp9, 0x08(r4)
+       dcbz    r11,r6
+       lfd     fp10, 0x10(r4)  /* 4 register stride copy is optimal  */
+       lfd     fp11, 0x18(r4)  /* to hide 1st level cache latency.  */
+       lfd     fp12, 0x20(r4)
+       stfd    fp9, 0x08(r6)
+       stfd    fp10, 0x10(r6)
+       stfd    fp11, 0x18(r6)
+       stfd    fp12, 0x20(r6)
+       lfd     fp9, 0x28(r4)
+       lfd     fp10, 0x30(r4)
+       lfd     fp11, 0x38(r4)
+       lfd     fp12, 0x40(r4)
+       stfd    fp9, 0x28(r6)
+       stfd    fp10, 0x30(r6)
+       stfd    fp11, 0x38(r6)
+       stfd    fp12, 0x40(r6)
+       lfd     fp9, 0x48(r4)
+       lfd     fp10, 0x50(r4)
+       lfd     fp11, 0x58(r4)
+       lfd     fp12, 0x60(r4)
+       stfd    fp9, 0x48(r6)
+       stfd    fp10, 0x50(r6)
+       stfd    fp11, 0x58(r6)
+       stfd    fp12, 0x60(r6)
+       lfd     fp9, 0x68(r4)
+       lfd     fp10, 0x70(r4)
+       lfd     fp11, 0x78(r4)
+       lfdu    fp12, 0x80(r4)
+       stfd    fp9, 0x68(r6)
+       stfd    fp10, 0x70(r6)
+       stfd    fp11, 0x78(r6)
+       stfdu   fp12, 0x80(r6)
+
+       bdnz    .Lloop
+
+.Lendloop:
+       cmpwi   r10,0
+       slwi    r10,r10,2       /* adjust from 128 to 32 byte stride  */
+       beq-    .Lendloop2
+       mtctr   r10
+
+.Lloop2:                       /* Copy aligned body  */
+       lfd     fp9, 0x08(r4)
+       lfd     fp10, 0x10(r4)
+       lfd     fp11, 0x18(r4)
+       lfdu    fp12, 0x20(r4)
+       stfd    fp9, 0x08(r6)
+       stfd    fp10, 0x10(r6)
+       stfd    fp11, 0x18(r6)
+       stfdu   fp12, 0x20(r6)
+
+       bdnz    .Lloop2
+.Lendloop2:
+
+.Llessthancacheline:           /* less than a cache line to do?  */
+       cmplwi  cr0,r5,16
+       srwi    r7,r5,4         /* divide size by 16  */
+       blt-    .Ldo_lt16
+       mtctr   r7
+
+.Lcopy_remaining:
+       lfd     fp9,0x08(r4)
+       lfdu    fp10,0x10(r4)
+       stfd    fp9,0x08(r6)
+       stfdu   fp10,0x10(r6)
+       bdnz    .Lcopy_remaining
+
+.Ldo_lt16:                     /* less than 16 ?  */
+       cmplwi  cr0,r5,0        /* copy remaining bytes (0-15)  */
+       beqlr+                  /* no rest to copy  */  
+       addi    r4,r4,8
+       addi    r6,r6,8
+
+.Lshortcopy:                   /* SIMPLE COPY to handle size <= 15 bytes  */
+       mtcrf   0x01,r5
+       sub     r7,r4,r6
+       bf-     cr7*4+0,8f
+       lfdx    fp9,r7,r6       /* copy 8 byte  */
+       stfd    fp9,0(r6)
+       addi    r6,r6,8
+8:
+       bf      cr7*4+1,4f
+       lwzx    r0,r7,r6        /* copy 4 byte  */
+       stw     r0,0(r6)
+       addi    r6,r6,4
+4:
+       bf      cr7*4+2,2f
+       lhzx    r0,r7,r6        /* copy 2 byte  */
+       sth     r0,0(r6)
+       addi    r6,r6,2
+2:
+       bf      cr7*4+3,1f
+       lbzx    r0,r7,r6        /* copy 1 byte  */
+       stb     r0,0(r6)
+1:     blr
+
+END (BP_SYM (memcpy))
+libc_hidden_builtin_def (memcpy)
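The residual-byte handling above (.Ldst_unaligned and .Lshortcopy) relies on mtcrf
copying the low four bits of the byte count into cr7, so each branch-on-bit (bf)
selects exactly one 8-, 4-, 2- or 1-byte copy and no loop is needed. A minimal C
sketch of the same trick (the name tail_copy is illustrative, not from the commit):

  #include <stddef.h>
  #include <string.h>

  /* Each set bit in the low four bits of N triggers exactly one copy,
     mirroring the bf tests on cr7 in the assembly.  */
  static void
  tail_copy (unsigned char *d, const unsigned char *s, size_t n)
  {
    if (n & 8) { memcpy (d, s, 8); d += 8; s += 8; }
    if (n & 4) { memcpy (d, s, 4); d += 4; s += 4; }
    if (n & 2) { memcpy (d, s, 2); d += 2; s += 2; }
    if (n & 1) *d = *s;
  }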
diff --git a/sysdeps/powerpc/powerpc64/cell/memcpy.S b/sysdeps/powerpc/powerpc64/cell/memcpy.S
new file mode 100644
index 0000000..2a00a6e
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/cell/memcpy.S
@@ -0,0 +1,245 @@
+/* Optimized memcpy implementation for CELL BE PowerPC.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+
+#define PREFETCH_AHEAD 6       /* number of cache lines to prefetch ahead of SRC  */
+#define ZERO_AHEAD 4           /* number of cache lines to zero ahead of DST  */
+
+/* memcpy routine optimized for CELL-BE-PPC    v2.0
+ *
+ * The CELL PPC core has 1 integer unit and 1 load/store unit.
+ * CELL:
+ * 1st level data cache = 32K
+ * 2nd level data cache = 512K
+ * 3rd level data cache = 0K
+ * At a 3.2 GHz clock rate the latency to the 2nd level cache is >36 clocks;
+ * latency to memory is >400 clocks.
+ * To improve copy performance we need to prefetch source data
+ * far ahead to hide this latency.
+ * For best performance, instruction forms ending in "." like "andi."
+ * should be avoided, as they are implemented in microcode on CELL.
+ * The code below is loop-unrolled for the CELL cache line of 128 bytes.
+ */
+
+.align  7
+
+EALIGN (BP_SYM (memcpy), 5, 0)
+        CALL_MCOUNT 3
+
+       dcbt    0,r4            /* Prefetch ONE SRC cacheline  */
+       cmpldi  cr1,r5,16       /* is size < 16 ?  */
+       mr      r6,r3           
+       blt+    cr1,.Lshortcopy
+
+.Lbigcopy:
+       neg     r8,r3           /* LS 3 bits = # bytes to 8-byte dest bdry  */
+        clrldi  r8,r8,64-4     /* align to 16-byte boundary  */
+       sub     r7,r4,r3
+       cmpldi  cr0,r8,0
+       beq+    .Ldst_aligned
+
+.Ldst_unaligned:
+       mtcrf   0x01,r8         /* put #bytes to boundary into cr7  */
+       subf    r5,r8,r5
+
+       bf      cr7*4+3,1f
+       lbzx    r0,r7,r6        /* copy 1 byte  */
+       stb     r0,0(r6)
+       addi    r6,r6,1
+1:     bf      cr7*4+2,2f
+       lhzx    r0,r7,r6        /* copy 2 byte  */
+       sth     r0,0(r6)
+       addi    r6,r6,2
+2:     bf      cr7*4+1,4f
+       lwzx    r0,r7,r6        /* copy 4 byte  */
+       stw     r0,0(r6)
+       addi    r6,r6,4
+4:     bf      cr7*4+0,8f
+       ldx     r0,r7,r6        /* copy 8 byte  */
+       std     r0,0(r6)
+       addi    r6,r6,8
+8:
+       add     r4,r7,r6
+
+.Ldst_aligned:
+
+       cmpdi   cr5,r5,128-1
+
+       neg     r7,r6
+       addi    r6,r6,-8        /* prepare for stdu  */
+       addi    r4,r4,-8        /* prepare for ldu  */
+
+       clrldi  r7,r7,64-7      /* align to cacheline boundary  */
+       ble+    cr5,.Llessthancacheline
+
+       cmpldi  cr6,r7,0
+       subf    r5,r7,r5
+       srdi    r7,r7,4         /* divide size by 16  */
+       srdi    r10,r5,7        /* number of cache lines to copy  */
+
+       cmpldi  r10,0
+       li      r11,0           /* number of cache lines to copy with prefetch  */
+       beq     .Lnocacheprefetch
+
+       cmpldi  r10,PREFETCH_AHEAD
+       li      r12,128+8       /* prefetch distance  */
+       ble     .Llessthanmaxprefetch
+
+       subi    r11,r10,PREFETCH_AHEAD
+       li      r10,PREFETCH_AHEAD
+
+.Llessthanmaxprefetch:
+       mtctr   r10
+
+.LprefetchSRC:
+       dcbt    r12,r4
+        addi    r12,r12,128
+        bdnz    .LprefetchSRC
+
+.Lnocacheprefetch:
+       mtctr   r7
+       cmpldi  cr1,r5,128
+       clrldi  r5,r5,64-7
+       beq     cr6,.Lcachelinealigned
+
+.Laligntocacheline:
+       ld      r9,0x08(r4)
+       ldu     r7,0x10(r4)
+       std     r9,0x08(r6)
+       stdu    r7,0x10(r6)
+       bdnz    .Laligntocacheline
+
+
+.Lcachelinealigned:            /* copy whole cache lines  */
+
+       blt-    cr1,.Llessthancacheline /* size <128  */
+
+.Louterloop:
+        cmpdi   r11,0
+       mtctr   r11
+       beq-    .Lendloop
+
+       li      r11,128*ZERO_AHEAD +8   /* DCBZ dist  */
+
+.align 4
+       /* Copy whole cachelines, optimized by prefetching SRC cacheline  */
+.Lloop:                        /* Copy aligned body  */
+       dcbt    r12,r4          /* PREFETCH SOURCE some cache lines ahead  */
+       ld      r9, 0x08(r4)
+       dcbz    r11,r6
+       ld      r7, 0x10(r4)    /* 4 register stride copy is optimal  */
+       ld      r8, 0x18(r4)    /* to hide 1st level cache latency.  */
+       ld      r0, 0x20(r4)
+       std     r9, 0x08(r6)
+       std     r7, 0x10(r6)
+       std     r8, 0x18(r6)
+       std     r0, 0x20(r6)
+       ld      r9, 0x28(r4)
+       ld      r7, 0x30(r4)
+       ld      r8, 0x38(r4)
+       ld      r0, 0x40(r4)
+       std     r9, 0x28(r6)
+       std     r7, 0x30(r6)
+       std     r8, 0x38(r6)
+       std     r0, 0x40(r6)
+       ld      r9, 0x48(r4)
+       ld      r7, 0x50(r4)
+       ld      r8, 0x58(r4)
+       ld      r0, 0x60(r4)
+       std     r9, 0x48(r6)
+       std     r7, 0x50(r6)
+       std     r8, 0x58(r6)
+       std     r0, 0x60(r6)
+       ld      r9, 0x68(r4)
+       ld      r7, 0x70(r4)
+       ld      r8, 0x78(r4)
+       ldu     r0, 0x80(r4)
+       std     r9, 0x68(r6)
+       std     r7, 0x70(r6)
+       std     r8, 0x78(r6)
+       stdu    r0, 0x80(r6)
+
+       bdnz    .Lloop
+
+.Lendloop:
+       cmpdi   r10,0
+       sldi    r10,r10,2       /* adjust from 128 to 32 byte stride  */
+       beq-    .Lendloop2
+       mtctr   r10
+
+.Lloop2:                       /* Copy aligned body  */
+       ld      r9, 0x08(r4)
+       ld      r7, 0x10(r4)
+       ld      r8, 0x18(r4)
+       ldu     r0, 0x20(r4)
+       std     r9, 0x08(r6)
+       std     r7, 0x10(r6)
+       std     r8, 0x18(r6)
+       stdu    r0, 0x20(r6)
+
+       bdnz    .Lloop2
+.Lendloop2:
+
+.Llessthancacheline:           /* less than a cache line to do?  */
+       cmpldi  cr0,r5,16
+       srdi    r7,r5,4         /* divide size by 16  */
+       blt-    .Ldo_lt16
+       mtctr   r7
+
+.Lcopy_remaining:
+       ld      r8,0x08(r4)
+       ldu     r7,0x10(r4)
+       std     r8,0x08(r6)
+       stdu    r7,0x10(r6)
+       bdnz    .Lcopy_remaining
+
+.Ldo_lt16:                     /* less than 16 ?  */
+       cmpldi  cr0,r5,0        /* copy remaining bytes (0-15)  */
+       beqlr+                  /* no rest to copy  */  
+       addi    r4,r4,8
+       addi    r6,r6,8
+
+.Lshortcopy:                   /* SIMPLE COPY to handle size <= 15 bytes  */
+       mtcrf   0x01,r5
+       sub     r7,r4,r6
+       bf-     cr7*4+0,8f
+       ldx     r0,r7,r6        /* copy 8 byte  */
+       std     r0,0(r6)
+       addi    r6,r6,8
+8:
+       bf      cr7*4+1,4f
+       lwzx    r0,r7,r6        /* copy 4 byte  */
+       stw     r0,0(r6)
+       addi    r6,r6,4
+4:
+       bf      cr7*4+2,2f
+       lhzx    r0,r7,r6        /* copy 2 byte  */
+       sth     r0,0(r6)
+       addi    r6,r6,2
+2:
+       bf      cr7*4+3,1f
+       lbzx    r0,r7,r6        /* copy 1 byte  */
+       stb     r0,0(r6)
+1:     blr
+
+END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
+libc_hidden_builtin_def (memcpy)
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/cell/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32/cell/fpu/Implies
new file mode 100644
index 0000000..7c381f0
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/cell/fpu/Implies
@@ -0,0 +1,3 @@
+# Make sure this comes before the powerpc/powerpc32/fpu that's
+# listed in unix/sysv/linux/powerpc/powerpc32/fpu/Implies.
+powerpc/powerpc32/cell/fpu
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/cell/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64/cell/fpu/Implies
new file mode 100644
index 0000000..b6720ec
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/cell/fpu/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/cell/fpu
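
Both memcpy implementations follow the same three-phase plan: align the destination,
stream whole 128-byte cache lines while dcbt prefetches the source PREFETCH_AHEAD
lines ahead and dcbz pre-zeroes the destination ZERO_AHEAD lines ahead, then copy
the tail. A rough C equivalent of that plan, assuming GCC for __builtin_prefetch
(the function name is illustrative, and the real code skips the alignment phase
for copies shorter than 16 bytes):

  #include <stddef.h>
  #include <stdint.h>
  #include <string.h>

  #define LINE 128                /* Cell cache-line size in bytes.  */
  #define PREFETCH_AHEAD 6        /* Same constant as the assembly.  */

  void *
  cell_memcpy_sketch (void *dst, const void *src, size_t n)
  {
    unsigned char *d = dst;
    const unsigned char *s = src;

    /* Phase 1: advance until DST is 16-byte aligned (the assembly does
       this with one 1/2/4/8-byte chunk per set cr7 bit, not a loop).  */
    while (n > 0 && ((uintptr_t) d & 15) != 0)
      {
        *d++ = *s++;
        n--;
      }

    /* Phase 2: whole 128-byte lines, touching the source several lines
       ahead so the loads hit cache instead of >400-cycle memory.  */
    while (n >= LINE)
      {
        __builtin_prefetch (s + PREFETCH_AHEAD * LINE, 0, 0);
        memcpy (d, s, LINE);      /* Assembly: 16 load/store pairs + dcbz.  */
        d += LINE;
        s += LINE;
        n -= LINE;
      }

    /* Phase 3: 0..127 remaining bytes (assembly: a 16-byte loop, then
       the bit-test sequence sketched after the 32-bit file above).  */
    while (n > 0)
      {
        *d++ = *s++;
        n--;
      }

    return dst;
  }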