Faster memcpy on x64.

author Ondrej Bilka <neleai@seznam.cz>

Mon, 20 May 2013 06:20:00 +0000 (08:20 +0200)

committer Ondrej Bilka <neleai@seznam.cz>

Mon, 20 May 2013 06:24:41 +0000 (08:24 +0200)
author Ondrej Bilka <neleai@seznam.cz>
Mon, 20 May 2013 06:20:00 +0000 (08:20 +0200)
committer Ondrej Bilka <neleai@seznam.cz>
Mon, 20 May 2013 06:24:41 +0000 (08:24 +0200)
diff --git a/ChangeLog b/ChangeLog

index a2323fa..6c540a7 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,13 @@
+2013-05-20  Ondřej Bílka  <neleai@seznam.cz>
+
+       * sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S: New file.
+       * sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Select
+       __memcpy_sse2_unaligned unless Slow_BSF; drop __memcpy_ssse3_back.
+       * sysdeps/x86_64/multiarch/Makefile (sysdep_routines):
+       Add memcpy-sse2-unaligned.S.
+       * sysdeps/x86_64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list):
+       Add __memcpy_sse2_unaligned.
+
  2013-05-19  Joseph Myers  <joseph@codesourcery.com>
  
         [BZ #15490]
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile

index 86787ee..203d16e 100644 (file)
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -7,7 +7,7 @@ endif
  ifeq ($(subdir),string)
  
  sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
-                  strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
+                  strend-sse4 memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned mempcpy-ssse3 \
                    memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
                    memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
                    strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c

index 05315fd..28d3579 100644 (file)
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -227,6 +227,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
               IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3,
                               __memcpy_ssse3_back)
               IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3)
+             IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
               IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
  
    /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S.  */
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S

new file mode 100644 (file)

index 0000000..efdfea2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -0,0 +1,223 @@
+/* memcpy with unaligned loads
+   Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#include "asm-syntax.h"
+
+#ifndef ALIGN
+# define ALIGN(n)      .p2align n
+#endif
+
+/* __memcpy_sse2_unaligned: memcpy built on unaligned 16-byte SSE2 moves.
+
+   In:      %rdi = dst, %rsi = src, %rdx = n (byte count).
+   Out:     %rax = dst.
+   Scratch: %rcx, %r8, %r9, %r10, %xmm8 and the flags; %rsi and %rdx are
+            modified on some paths.  The stack is not used.
+
+   Sizes up to 128 bytes are copied with head/tail pairs of moves that
+   may overlap each other.  Larger sizes add a 64-byte loop that stores
+   to 64-byte aligned addresses.  L(overlapping) is a slow path built
+   from a 16-byte loop and byte loops.  */
+ENTRY(__memcpy_sse2_unaligned)
+       /* rax = src - dst - n, rcx = 2 * n.  The slow path is taken when
+          rax is below rcx as unsigned values; n == 0 never takes it.
+          NOTE(review): that condition is "src - dst in [n, 3n)", which
+          does not look like a test for overlapping buffers -- confirm
+          the intent.  */
+       movq    %rsi, %rax
+       leaq    (%rdx,%rdx), %rcx
+       subq    %rdi, %rax
+       subq    %rdx, %rax
+       cmpq    %rcx, %rax
+       jb      L(overlapping)
+       cmpq    $16, %rdx
+       jbe     L(less_16)
+       /* n > 16: copy the first and the last 16 bytes.  The SSE moves
+          leave the flags alone, so the compare against 32 is still
+          valid at the branch.  */
+       movdqu  (%rsi), %xmm8
+       cmpq    $32, %rdx
+       movdqu  %xmm8, (%rdi)
+       movdqu  -16(%rsi,%rdx), %xmm8
+       movdqu  %xmm8, -16(%rdi,%rdx)
+       ja      .L31
+L(return):
+       movq    %rdi, %rax
+       ret
+       .p2align 4,,10
+       ALIGN(4)
+.L31:
+       /* n > 32: also copy bytes [16, 32) and [n - 32, n - 16).  */
+       movdqu  16(%rsi), %xmm8
+       cmpq    $64, %rdx
+       movdqu  %xmm8, 16(%rdi)
+       movdqu  -32(%rsi,%rdx), %xmm8
+       movdqu  %xmm8, -32(%rdi,%rdx)
+       jbe     L(return)
+       /* n > 64: finish the first and the last 64 bytes.  */
+       movdqu  32(%rsi), %xmm8
+       cmpq    $128, %rdx
+       movdqu  %xmm8, 32(%rdi)
+       movdqu  -48(%rsi,%rdx), %xmm8
+       movdqu  %xmm8, -48(%rdi,%rdx)
+       movdqu  48(%rsi), %xmm8
+       movdqu  %xmm8, 48(%rdi)
+       movdqu  -64(%rsi,%rdx), %xmm8
+       movdqu  %xmm8, -64(%rdi,%rdx)
+       jbe     L(return)
+       /* n > 128: the head and tail are done; fill the middle.
+          rcx = first 64-byte aligned address above dst,
+          rdx = (dst + n) rounded down to 64 bytes (end of the loop),
+          rsi = source address matching rcx.  */
+       leaq    64(%rdi), %rcx
+       addq    %rdi, %rdx
+       andq    $-64, %rdx
+       andq    $-64, %rcx
+       movq    %rcx, %rax
+       subq    %rdi, %rax
+       addq    %rax, %rsi
+       cmpq    %rdx, %rcx
+       je      L(return)
+       /* r10 = src - dst, so (%rcx,%r10) is the source byte for (%rcx);
+          r9, r8 and rax are the same offset plus 16, 32 and 48.  */
+       movq    %rsi, %r10
+       subq    %rcx, %r10
+       leaq    16(%r10), %r9
+       leaq    32(%r10), %r8
+       leaq    48(%r10), %rax
+       .p2align 4,,10
+       ALIGN(4)
+L(loop):
+       /* 64 bytes per iteration: unaligned loads, aligned stores.  */
+       movdqu  (%rcx,%r10), %xmm8
+       movdqa  %xmm8, (%rcx)
+       movdqu  (%rcx,%r9), %xmm8
+       movdqa  %xmm8, 16(%rcx)
+       movdqu  (%rcx,%r8), %xmm8
+       movdqa  %xmm8, 32(%rcx)
+       movdqu  (%rcx,%rax), %xmm8
+       movdqa  %xmm8, 48(%rcx)
+       addq    $64, %rcx
+       cmpq    %rcx, %rdx
+       jne     L(loop)
+       jmp     L(return)
+L(overlapping):
+       /* Slow path.  dst >= src is copied backwards at .L3, everything
+          else forwards.  */
+       cmpq    %rsi, %rdi
+       jae     .L3
+       testq   %rdx, %rdx
+       .p2align 4,,5
+       je      L(return)
+       /* r9 = n / 16 (number of 16-byte chunks), rax = n rounded down to
+          a multiple of 16.  */
+       movq    %rdx, %r9
+       leaq    16(%rsi), %rcx
+       leaq    16(%rdi), %r8
+       shrq    $4, %r9
+       movq    %r9, %rax
+       salq    $4, %rax
+       /* Use the 16-byte loop only if n > 15 and dst and src are at
+          least 16 bytes apart (dst >= src + 16 or src >= dst + 16);
+          otherwise copy everything bytewise from offset 0.  */
+       cmpq    %rcx, %rdi
+       setae   %cl
+       cmpq    %r8, %rsi
+       setae   %r8b
+       orl     %r8d, %ecx
+       cmpq    $15, %rdx
+       seta    %r8b
+       testb   %r8b, %cl
+       je      .L16
+       testq   %rax, %rax
+       je      .L16
+       xorl    %ecx, %ecx
+       xorl    %r8d, %r8d
+.L7:
+       /* rcx = byte offset, r8 = chunks copied so far.  */
+       movdqu  (%rsi,%rcx), %xmm8
+       addq    $1, %r8
+       movdqu  %xmm8, (%rdi,%rcx)
+       addq    $16, %rcx
+       cmpq    %r8, %r9
+       ja      .L7
+       cmpq    %rax, %rdx
+       je      L(return)
+.L21:
+       /* Copy bytes [rax, n) one at a time.  */
+       movzbl  (%rsi,%rax), %ecx
+       movb    %cl, (%rdi,%rax)
+       addq    $1, %rax
+       cmpq    %rax, %rdx
+       ja      .L21
+       jmp     L(return)
+L(less_16):
+       /* n <= 16.  Bit 3 or bit 4 of n set means 8 <= n <= 16.  */
+       testb   $24, %dl
+       jne     L(between_9_16)
+       /* Bit 2 of n set means 4 <= n <= 7.  */
+       testb   $4, %dl
+       .p2align 4,,5
+       jne     L(between_5_8)
+       testq   %rdx, %rdx
+       .p2align 4,,2
+       je      L(return)
+       /* 1 <= n <= 3: copy the first byte, then the last two bytes if
+          n >= 2.  */
+       movzbl  (%rsi), %eax
+       testb   $2, %dl
+       movb    %al, (%rdi)
+       je      L(return)
+       movzwl  -2(%rsi,%rdx), %eax
+       movw    %ax, -2(%rdi,%rdx)
+       jmp     L(return)
+.L3:
+       /* Backward byte copy for dst >= src.  n is at least 1 here
+          because L(overlapping) is never entered with n == 0.  */
+       leaq    -1(%rdx), %rax
+       .p2align 4,,10
+       ALIGN(4)
+.L11:
+       movzbl  (%rsi,%rax), %edx
+       movb    %dl, (%rdi,%rax)
+       /* Stop once offset 0 has been copied: subtracting 1 from 0
+          sets the carry flag.  */
+       subq    $1, %rax
+       jnc     .L11
+       jmp     L(return)
+L(between_9_16):
+       /* 8 <= n <= 16: first and last 8 bytes.  */
+       movq    (%rsi), %rax
+       movq    %rax, (%rdi)
+       movq    -8(%rsi,%rdx), %rax
+       movq    %rax, -8(%rdi,%rdx)
+       jmp     L(return)
+.L16:
+       /* Bytewise forward copy of the whole buffer, from offset 0.  */
+       xorl    %eax, %eax
+       jmp     .L21
+L(between_5_8):
+       /* 4 <= n <= 7: first and last 4 bytes.  */
+       movl    (%rsi), %eax
+       movl    %eax, (%rdi)
+       movl    -4(%rsi,%rdx), %eax
+       movl    %eax, -4(%rdi,%rdx)
+       jmp     L(return)
+END(__memcpy_sse2_unaligned)
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S

index b452f53..a1e5031 100644 (file)
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -33,13 +33,14 @@ ENTRY(__new_memcpy)
         jne     1f
         call    __init_cpu_features
  1:     leaq    __memcpy_sse2(%rip), %rax
-       testl   $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
-       jz      2f
-       leaq    __memcpy_ssse3(%rip), %rax
-       testl   $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
-       jz      2f
-       leaq    __memcpy_ssse3_back(%rip), %rax
-2:     ret
+       testl   $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
+       jnz     2f
+       leaq    __memcpy_sse2_unaligned(%rip), %rax
+       ret
+2:     testl   $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+       jz 3f
+       leaq    __memcpy_ssse3(%rip), %rax
+3:     ret
  END(__new_memcpy)
  
  # undef ENTRY
author	Ondrej Bilka <neleai@seznam.cz>
	Mon, 20 May 2013 06:20:00 +0000 (08:20 +0200)
committer	Ondrej Bilka <neleai@seznam.cz>
	Mon, 20 May 2013 06:24:41 +0000 (08:24 +0200)
ChangeLog		patch \| blob \| history
sysdeps/x86_64/multiarch/Makefile		patch \| blob \| history
sysdeps/x86_64/multiarch/ifunc-impl-list.c		patch \| blob \| history
sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/multiarch/memcpy.S		patch \| blob \| history