From 99710781cc47002612e609c7dc5f34692b64e9b3 Mon Sep 17 00:00:00 2001 From: Liubov Dmitrieva Date: Tue, 19 Jul 2011 17:11:54 -0400 Subject: [PATCH] Improve 64 bit strcat functions with SSE2/SSSE3 --- ChangeLog | 29 ++ NEWS | 5 +- string/strncat.c | 6 +- sysdeps/x86_64/multiarch/Makefile | 6 +- sysdeps/x86_64/multiarch/init-arch.c | 10 +- sysdeps/x86_64/multiarch/init-arch.h | 2 + sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S | 55 +++ sysdeps/x86_64/multiarch/strcat-ssse3.S | 559 ++++++++++++++++++++++ sysdeps/x86_64/multiarch/strcat.S | 85 ++++ sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S | 451 +++++++++++------ sysdeps/x86_64/multiarch/strcpy-ssse3.S | 280 +++++------ sysdeps/x86_64/multiarch/strlen-no-bsf.S | 74 +-- sysdeps/x86_64/multiarch/strlen-sse2-pminub.S | 260 ++++++++++ sysdeps/x86_64/multiarch/strlen.S | 5 +- sysdeps/x86_64/multiarch/strncat-c.c | 8 + sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S | 3 + sysdeps/x86_64/multiarch/strncat-ssse3.S | 3 + sysdeps/x86_64/multiarch/strncat.S | 3 + 18 files changed, 1523 insertions(+), 321 deletions(-) create mode 100644 sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S create mode 100644 sysdeps/x86_64/multiarch/strcat-ssse3.S create mode 100644 sysdeps/x86_64/multiarch/strcat.S create mode 100644 sysdeps/x86_64/multiarch/strlen-sse2-pminub.S create mode 100644 sysdeps/x86_64/multiarch/strncat-c.c create mode 100644 sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S create mode 100644 sysdeps/x86_64/multiarch/strncat-ssse3.S create mode 100644 sysdeps/x86_64/multiarch/strncat.S diff --git a/ChangeLog b/ChangeLog index 0932ae5..e3dc2ee 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,32 @@ +2011-07-15 Liubov Dmitrieva + + * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add + strcat-ssse3 strcat-sse2-unaligned strncat-ssse3 + strncat-sse2-unaligned strncat-c strlen-sse2-pminub + * sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S: New file. + * sysdeps/x86_64/multiarch/strcat.S: New file. 
+ * sysdeps/x86_64/multiarch/strncat.S: New file. + * sysdeps/x86_64/multiarch/strncat-c.c: New file. + * sysdeps/x86_64/multiarch/strcat-ssse3.S: New file. + * sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S: New file. + * sysdeps/x86_64/multiarch/strncat-ssse3.S: New file. + * sysdeps/x86_64/multiarch/strcpy-ssse3.S + (USE_AS_STRCAT): Define. + Add strcat and strncat support. + * sysdeps/x86_64/multiarch/strlen-no-bsf.S: Likewise. + * sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: Likewise. + * sysdeps/x86_64/multiarch/strlen-sse2-pminub.S: New file. + * string/strncat.c: Update. + (USE_AS_STRNCAT): Define. + * sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features): + Turn on bit_Prefer_PMINUB_for_stringop for Intel Core i3, i5 + and i7. + * sysdeps/x86_64/multiarch/init-arch.h + (bit_Prefer_PMINUB_for_stringop): New. + (index_Prefer_PMINUB_for_stringop): Likewise. + * sysdeps/x86_64/multiarch/strlen.S (strlen): Check + bit_Prefer_PMINUB_for_stringop. + 2011-07-19 Ulrich Drepper * crypt/sha512.h (struct sha512_ctx): Move buffer into union and add diff --git a/NEWS b/NEWS index f3cead3..fb2c15e 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,4 @@ -GNU C Library NEWS -- history of user-visible changes. 2011-7-6 +GNU C Library NEWS -- history of user-visible changes. 2011-7-19 Copyright (C) 1992-2009, 2010, 2011 Free Software Foundation, Inc. See the end for copying conditions. @@ -23,6 +23,9 @@ Version 2.15 * Improved strcpy, strncpy, stpcpy, stpncpy for SSE2 and SSSE3 on x86-64. Contributed by HJ Lu. + +* Improved strcat and strncat on x86-64. + Contributed by Liubov Dmitrieva. 
Version 2.14 diff --git a/string/strncat.c b/string/strncat.c index 2e2de11..72d9d69 100644 --- a/string/strncat.c +++ b/string/strncat.c @@ -24,10 +24,12 @@ typedef char reg_char; #endif -#undef strncat +#ifndef STRNCAT +# define STRNCAT strncat +#endif char * -strncat (s1, s2, n) +STRNCAT (s1, s2, n) char *s1; const char *s2; size_t n; diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 88410b3..c959dd1 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -5,14 +5,16 @@ endif ifeq ($(subdir),string) -sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \ +sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \ strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \ memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \ memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \ strncase_l-ssse3 strlen-sse4 strlen-no-bsf memset-x86-64 \ strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \ strcpy-sse2-unaligned strncpy-sse2-unaligned \ - stpcpy-sse2-unaligned stpncpy-sse2-unaligned + stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ + strcat-sse2-unaligned strncat-sse2-unaligned \ + strcat-ssse3 strncat-ssse3 strlen-sse2-pminub ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift CFLAGS-varshift.c += -msse4 diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c index 81b2378..0a145ca 100644 --- a/sysdeps/x86_64/multiarch/init-arch.c +++ b/sysdeps/x86_64/multiarch/init-arch.c @@ -97,18 +97,22 @@ __init_cpu_features (void) case 0x2c: case 0x2e: case 0x2f: - /* Rep string instructions, copy backward and unaligned loads - are fast on Intel Core i3, i5 and i7. */ + /* Rep string instructions, copy backward, unaligned loads + and pminub are fast on Intel Core i3, i5 and i7. 
*/ #if index_Fast_Rep_String != index_Fast_Copy_Backward # error index_Fast_Rep_String != index_Fast_Copy_Backward #endif #if index_Fast_Rep_String != index_Fast_Unaligned_Load # error index_Fast_Rep_String != index_Fast_Unaligned_Load #endif +#if index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop +# error index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop +#endif __cpu_features.feature[index_Fast_Rep_String] |= (bit_Fast_Rep_String | bit_Fast_Copy_Backward - | bit_Fast_Unaligned_Load); + | bit_Fast_Unaligned_Load + | bit_Prefer_PMINUB_for_stringop); break; } } diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h index addf5f3..6cfdbdd 100644 --- a/sysdeps/x86_64/multiarch/init-arch.h +++ b/sysdeps/x86_64/multiarch/init-arch.h @@ -21,6 +21,7 @@ #define bit_Slow_BSF (1 << 2) #define bit_Prefer_SSE_for_memop (1 << 3) #define bit_Fast_Unaligned_Load (1 << 4) +#define bit_Prefer_PMINUB_for_stringop (1 << 5) #ifdef __ASSEMBLER__ @@ -41,6 +42,7 @@ # define index_Slow_BSF FEATURE_INDEX_1*FEATURE_SIZE # define index_Prefer_SSE_for_memop FEATURE_INDEX_1*FEATURE_SIZE # define index_Fast_Unaligned_Load FEATURE_INDEX_1*FEATURE_SIZE +# define index_Prefer_PMINUB_for_stringop FEATURE_INDEX_1*FEATURE_SIZE #else /* __ASSEMBLER__ */ diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S new file mode 100644 index 0000000..1150281 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S @@ -0,0 +1,55 @@ +/* strcat with SSE2 + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#ifndef NOT_IN_libc + +# include <sysdep.h> + +# ifndef STRCAT +# define STRCAT __strcat_sse2_unaligned +# endif + +# define USE_AS_STRCAT + +.text +ENTRY (STRCAT) + mov %rdi, %r9 +# ifdef USE_AS_STRNCAT + mov %rdx, %r8 +# endif + +# define RETURN jmp L(StartStrcpyPart) +# include "strlen-sse2-pminub.S" +# undef RETURN + +L(StartStrcpyPart): + lea (%r9, %rax), %rdi + mov %rsi, %rcx + mov %r9, %rax /* save result */ + +# ifdef USE_AS_STRNCAT + test %r8, %r8 + jz L(ExitZero) +# define USE_AS_STRNCPY +# endif + +# include "strcpy-sse2-unaligned.S" +#endif + diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S new file mode 100644 index 0000000..66736a7 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcat-ssse3.S @@ -0,0 +1,559 @@ +/* strcat with SSSE3 + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#ifndef NOT_IN_libc + +# include <sysdep.h> + +# ifndef STRCAT +# define STRCAT __strcat_ssse3 +# endif + +# define USE_AS_STRCAT + +.text +ENTRY (STRCAT) +# ifdef USE_AS_STRNCAT + mov %rdx, %r8 +# endif + +# define RETURN jmp L(StartStrcpyPart) +# include "strlen-no-bsf.S" + +# undef RETURN + +L(StartStrcpyPart): + mov %rsi, %rcx + lea (%rdi, %rax), %rdx +# ifdef USE_AS_STRNCAT + test %r8, %r8 + jz L(StrncatExit0) + cmp $8, %r8 + jbe L(StrncatExit8Bytes) +# endif + cmpb $0, (%rcx) + jz L(Exit1) + cmpb $0, 1(%rcx) + jz L(Exit2) + cmpb $0, 2(%rcx) + jz L(Exit3) + cmpb $0, 3(%rcx) + jz L(Exit4) + cmpb $0, 4(%rcx) + jz L(Exit5) + cmpb $0, 5(%rcx) + jz L(Exit6) + cmpb $0, 6(%rcx) + jz L(Exit7) + cmpb $0, 7(%rcx) + jz L(Exit8) + cmpb $0, 8(%rcx) + jz L(Exit9) +# ifdef USE_AS_STRNCAT + cmp $16, %r8 + jb L(StrncatExit15Bytes) +# endif + cmpb $0, 9(%rcx) + jz L(Exit10) + cmpb $0, 10(%rcx) + jz L(Exit11) + cmpb $0, 11(%rcx) + jz L(Exit12) + cmpb $0, 12(%rcx) + jz L(Exit13) + cmpb $0, 13(%rcx) + jz L(Exit14) + cmpb $0, 14(%rcx) + jz L(Exit15) + cmpb $0, 15(%rcx) + jz L(Exit16) +# ifdef USE_AS_STRNCAT + cmp $16, %r8 + je L(StrncatExit16) +# define USE_AS_STRNCPY +# endif + +# include "strcpy-ssse3.S" + + .p2align 4 +L(CopyFrom1To16Bytes): + add %rsi, %rdx + add %rsi, %rcx + + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit1) + test $0x02, %al + jnz L(Exit2) + test $0x04, %al + jnz L(Exit3) + test $0x08, %al + jnz L(Exit4) + test $0x10, %al + jnz L(Exit5) + test $0x20, %al + jnz L(Exit6) + test $0x40, %al + jnz L(Exit7) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(ExitHigh): + test $0x01, %ah + jnz L(Exit9) + test $0x02, %ah + jnz L(Exit10) + test $0x04, %ah + jnz L(Exit11) + test $0x08, %ah + jnz L(Exit12) + 
test $0x10, %ah + jnz L(Exit13) + test $0x20, %ah + jnz L(Exit14) + test $0x40, %ah + jnz L(Exit15) + movlpd (%rcx), %xmm0 + movlpd 8(%rcx), %xmm1 + movlpd %xmm0, (%rdx) + movlpd %xmm1, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit1): + xor %ah, %ah + movb %ah, 1(%rdx) +L(Exit1): + movb (%rcx), %al + movb %al, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit2): + xor %ah, %ah + movb %ah, 2(%rdx) +L(Exit2): + movw (%rcx), %ax + movw %ax, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit3): + xor %ah, %ah + movb %ah, 3(%rdx) +L(Exit3): + movw (%rcx), %ax + movw %ax, (%rdx) + movb 2(%rcx), %al + movb %al, 2(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit4): + xor %ah, %ah + movb %ah, 4(%rdx) +L(Exit4): + mov (%rcx), %eax + mov %eax, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit5): + xor %ah, %ah + movb %ah, 5(%rdx) +L(Exit5): + mov (%rcx), %eax + mov %eax, (%rdx) + movb 4(%rcx), %al + movb %al, 4(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit6): + xor %ah, %ah + movb %ah, 6(%rdx) +L(Exit6): + mov (%rcx), %eax + mov %eax, (%rdx) + movw 4(%rcx), %ax + movw %ax, 4(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit7): + xor %ah, %ah + movb %ah, 7(%rdx) +L(Exit7): + mov (%rcx), %eax + mov %eax, (%rdx) + mov 3(%rcx), %eax + mov %eax, 3(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit8): + xor %ah, %ah + movb %ah, 8(%rdx) +L(Exit8): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit9): + xor %ah, %ah + movb %ah, 9(%rdx) +L(Exit9): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movb 8(%rcx), %al + movb %al, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit10): + xor %ah, %ah + movb %ah, 10(%rdx) +L(Exit10): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movw 8(%rcx), %ax + movw %ax, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit11): + xor %ah, %ah + movb %ah, 11(%rdx) +L(Exit11): + movlpd (%rcx), %xmm0 + movlpd 
%xmm0, (%rdx) + mov 7(%rcx), %eax + mov %eax, 7(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit12): + xor %ah, %ah + movb %ah, 12(%rdx) +L(Exit12): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + mov 8(%rcx), %eax + mov %eax, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit13): + xor %ah, %ah + movb %ah, 13(%rdx) +L(Exit13): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movlpd 5(%rcx), %xmm1 + movlpd %xmm1, 5(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit14): + xor %ah, %ah + movb %ah, 14(%rdx) +L(Exit14): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movlpd 6(%rcx), %xmm1 + movlpd %xmm1, 6(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit15): + xor %ah, %ah + movb %ah, 15(%rdx) +L(Exit15): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movlpd 7(%rcx), %xmm1 + movlpd %xmm1, 7(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit16): + xor %ah, %ah + movb %ah, 16(%rdx) +L(Exit16): + movlpd (%rcx), %xmm0 + movlpd 8(%rcx), %xmm1 + movlpd %xmm0, (%rdx) + movlpd %xmm1, 8(%rdx) + mov %rdi, %rax + ret + +# ifdef USE_AS_STRNCPY + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %r8 + add %rsi, %rcx + lea (%rsi, %rdx), %rsi + lea -9(%r8), %rdx + and $1<<7, %dh + or %al, %dh + test %dh, %dh + lea (%rsi), %rdx + jz L(ExitHighCase2) + + test $0x01, %al + jnz L(Exit1) + cmp $1, %r8 + je L(StrncatExit1) + test $0x02, %al + jnz L(Exit2) + cmp $2, %r8 + je L(StrncatExit2) + test $0x04, %al + jnz L(Exit3) + cmp $3, %r8 + je L(StrncatExit3) + test $0x08, %al + jnz L(Exit4) + cmp $4, %r8 + je L(StrncatExit4) + test $0x10, %al + jnz L(Exit5) + cmp $5, %r8 + je L(StrncatExit5) + test $0x20, %al + jnz L(Exit6) + cmp $6, %r8 + je L(StrncatExit6) + test $0x40, %al + jnz L(Exit7) + cmp $7, %r8 + je L(StrncatExit7) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + lea 7(%rdx), %rax + cmpb $1, (%rax) + sbb $-1, %rax + xor %cl, %cl + movb %cl, (%rax) + mov %rdi, %rax + ret + + .p2align 4 +L(ExitHighCase2): + test $0x01, %ah + jnz 
L(Exit9) + cmp $9, %r8 + je L(StrncatExit9) + test $0x02, %ah + jnz L(Exit10) + cmp $10, %r8 + je L(StrncatExit10) + test $0x04, %ah + jnz L(Exit11) + cmp $11, %r8 + je L(StrncatExit11) + test $0x8, %ah + jnz L(Exit12) + cmp $12, %r8 + je L(StrncatExit12) + test $0x10, %ah + jnz L(Exit13) + cmp $13, %r8 + je L(StrncatExit13) + test $0x20, %ah + jnz L(Exit14) + cmp $14, %r8 + je L(StrncatExit14) + test $0x40, %ah + jnz L(Exit15) + cmp $15, %r8 + je L(StrncatExit15) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movlpd 8(%rcx), %xmm1 + movlpd %xmm1, 8(%rdx) + mov %rdi, %rax + ret + +L(CopyFrom1To16BytesCase2OrCase3): + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + + .p2align 4 +L(CopyFrom1To16BytesCase3): + add $16, %r8 + add %rsi, %rdx + add %rsi, %rcx + + cmp $8, %r8 + ja L(ExitHighCase3) + cmp $1, %r8 + je L(StrncatExit1) + cmp $2, %r8 + je L(StrncatExit2) + cmp $3, %r8 + je L(StrncatExit3) + cmp $4, %r8 + je L(StrncatExit4) + cmp $5, %r8 + je L(StrncatExit5) + cmp $6, %r8 + je L(StrncatExit6) + cmp $7, %r8 + je L(StrncatExit7) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + xor %ah, %ah + movb %ah, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(ExitHighCase3): + cmp $9, %r8 + je L(StrncatExit9) + cmp $10, %r8 + je L(StrncatExit10) + cmp $11, %r8 + je L(StrncatExit11) + cmp $12, %r8 + je L(StrncatExit12) + cmp $13, %r8 + je L(StrncatExit13) + cmp $14, %r8 + je L(StrncatExit14) + cmp $15, %r8 + je L(StrncatExit15) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movlpd 8(%rcx), %xmm1 + movlpd %xmm1, 8(%rdx) + xor %ah, %ah + movb %ah, 16(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit0): + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit15Bytes): + cmp $9, %r8 + je L(StrncatExit9) + cmpb $0, 9(%rcx) + jz L(Exit10) + cmp $10, %r8 + je L(StrncatExit10) + cmpb $0, 10(%rcx) + jz L(Exit11) + cmp $11, %r8 + je L(StrncatExit11) + cmpb $0, 11(%rcx) + jz L(Exit12) + cmp $12, %r8 + je L(StrncatExit12) + cmpb $0, 12(%rcx) + jz L(Exit13) + cmp $13, %r8 + je 
L(StrncatExit13) + cmpb $0, 13(%rcx) + jz L(Exit14) + cmp $14, %r8 + je L(StrncatExit14) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movlpd 7(%rcx), %xmm1 + movlpd %xmm1, 7(%rdx) + lea 14(%rdx), %rax + cmpb $1, (%rax) + sbb $-1, %rax + xor %cl, %cl + movb %cl, (%rax) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit8Bytes): + cmpb $0, (%rcx) + jz L(Exit1) + cmp $1, %r8 + je L(StrncatExit1) + cmpb $0, 1(%rcx) + jz L(Exit2) + cmp $2, %r8 + je L(StrncatExit2) + cmpb $0, 2(%rcx) + jz L(Exit3) + cmp $3, %r8 + je L(StrncatExit3) + cmpb $0, 3(%rcx) + jz L(Exit4) + cmp $4, %r8 + je L(StrncatExit4) + cmpb $0, 4(%rcx) + jz L(Exit5) + cmp $5, %r8 + je L(StrncatExit5) + cmpb $0, 5(%rcx) + jz L(Exit6) + cmp $6, %r8 + je L(StrncatExit6) + cmpb $0, 6(%rcx) + jz L(Exit7) + cmp $7, %r8 + je L(StrncatExit7) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + lea 7(%rdx), %rax + cmpb $1, (%rax) + sbb $-1, %rax + xor %cl, %cl + movb %cl, (%rax) + mov %rdi, %rax + ret + +# endif +END (STRCAT) +#endif + diff --git a/sysdeps/x86_64/multiarch/strcat.S b/sysdeps/x86_64/multiarch/strcat.S new file mode 100644 index 0000000..f3ccc8e --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcat.S @@ -0,0 +1,85 @@ +/* Multiple versions of strcat + Copyright (C) 2009, 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <init-arch.h> + +#ifndef USE_AS_STRNCAT +# ifndef STRCAT +# define STRCAT strcat +# endif +#endif + +#ifdef USE_AS_STRNCAT +# define STRCAT_SSSE3 __strncat_ssse3 +# define STRCAT_SSE2 __strncat_sse2 +# define STRCAT_SSE2_UNALIGNED __strncat_sse2_unaligned +# define __GI_STRCAT __GI_strncat +# define __GI___STRCAT __GI___strncat +#else +# define STRCAT_SSSE3 __strcat_ssse3 +# define STRCAT_SSE2 __strcat_sse2 +# define STRCAT_SSE2_UNALIGNED __strcat_sse2_unaligned +# define __GI_STRCAT __GI_strcat +# define __GI___STRCAT __GI___strcat +#endif + + +/* Define multiple versions only for the definition in libc. */ +#ifndef NOT_IN_libc + .text +ENTRY(STRCAT) + .type STRCAT, @gnu_indirect_function + cmpl $0, __cpu_features+KIND_OFFSET(%rip) + jne 1f + call __init_cpu_features +1: leaq STRCAT_SSE2_UNALIGNED(%rip), %rax + testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip) + jnz 2f + leaq STRCAT_SSE2(%rip), %rax + testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) + jz 2f + leaq STRCAT_SSSE3(%rip), %rax +2: ret +END(STRCAT) + +# undef ENTRY +# define ENTRY(name) \ + .type STRCAT_SSE2, @function; \ + .align 16; \ + STRCAT_SSE2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCAT_SSE2, .-STRCAT_SSE2 +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strcat calls through a PLT. + The speedup we get from using SSSE3 instruction is likely eaten away + by the indirect call in the PLT. 
*/ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCAT; __GI_STRCAT = STRCAT_SSE2 +# undef libc_hidden_def +# define libc_hidden_def(name) \ + .globl __GI___STRCAT; __GI___STRCAT = STRCAT_SSE2 +#endif + +#ifndef USE_AS_STRNCAT +# include "../strcat.S" +#endif diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S index 9a8d186..6de8c47 100644 --- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S +++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S @@ -20,10 +20,13 @@ #ifndef NOT_IN_libc -# include <sysdep.h> +# ifndef USE_AS_STRCAT +# include <sysdep.h> + +# ifndef STRCPY +# define STRCPY __strcpy_sse2_unaligned +# endif -# ifndef STRCPY -# define STRCPY __strcpy_sse2_unaligned # endif # define JMPTBL(I, B) I - B @@ -33,16 +36,20 @@ lea (%r11, %rcx), %rcx; \ jmp *%rcx - .text +# ifndef USE_AS_STRCAT + +.text ENTRY (STRCPY) -# ifdef USE_AS_STRNCPY +# ifdef USE_AS_STRNCPY mov %rdx, %r8 test %r8, %r8 jz L(ExitZero) -# endif +# endif mov %rsi, %rcx -# ifndef USE_AS_STPCPY +# ifndef USE_AS_STPCPY mov %rdi, %rax /* save result */ +# endif + # endif and $15, %rcx @@ -59,7 +66,7 @@ ENTRY (STRCPY) pmovmskb %xmm1, %rdx shr %cl, %rdx # ifdef USE_AS_STRNCPY -# if defined USE_AS_STPCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT cmp $16, %r8 # else cmp $17, %r8 @@ -72,7 +79,7 @@ ENTRY (STRCPY) pcmpeqb 16(%rsi), %xmm0 pmovmskb %xmm0, %rdx # ifdef USE_AS_STRNCPY -# if defined USE_AS_STPCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT cmp $32, %r8 # else cmp $33, %r8 @@ -102,7 +109,7 @@ L(Unalign16Both): jbe L(CopyFrom1To16BytesCase2OrCase3) # endif test %rdx, %rdx -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT jnz L(CopyFrom1To16BytesUnalignedXmm2) # else jnz L(CopyFrom1To16Bytes) @@ -118,7 +125,7 @@ L(Unalign16Both): jbe L(CopyFrom1To16BytesCase2OrCase3) # endif test %rdx, %rdx -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT jnz 
L(CopyFrom1To16BytesUnalignedXmm3) # else jnz L(CopyFrom1To16Bytes) @@ -134,7 +141,7 @@ L(Unalign16Both): jbe L(CopyFrom1To16BytesCase2OrCase3) # endif test %rdx, %rdx -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT jnz L(CopyFrom1To16BytesUnalignedXmm4) # else jnz L(CopyFrom1To16Bytes) @@ -150,7 +157,7 @@ L(Unalign16Both): jbe L(CopyFrom1To16BytesCase2OrCase3) # endif test %rdx, %rdx -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT jnz L(CopyFrom1To16BytesUnalignedXmm1) # else jnz L(CopyFrom1To16Bytes) @@ -166,7 +173,7 @@ L(Unalign16Both): jbe L(CopyFrom1To16BytesCase2OrCase3) # endif test %rdx, %rdx -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT jnz L(CopyFrom1To16BytesUnalignedXmm2) # else jnz L(CopyFrom1To16Bytes) @@ -182,7 +189,7 @@ L(Unalign16Both): jbe L(CopyFrom1To16BytesCase2OrCase3) # endif test %rdx, %rdx -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT jnz L(CopyFrom1To16BytesUnalignedXmm3) # else jnz L(CopyFrom1To16Bytes) @@ -264,10 +271,10 @@ L(Unaligned64Leave): movdqu %xmm4, (%rdi) movdqu %xmm5, 16(%rdi) movdqu %xmm6, 32(%rdi) -# if defined USE_AS_STRNCPY -# ifdef USE_AS_STPCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY lea 48(%rdi, %rdx), %rax -# endif +# endif movdqu %xmm7, 48(%rdi) add $15, %r8 sub %rdx, %r8 @@ -288,7 +295,7 @@ L(SourceStringAlignmentZero): pmovmskb %xmm0, %rdx # ifdef USE_AS_STRNCPY -# if defined USE_AS_STPCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT cmp $16, %r8 # else cmp $17, %r8 @@ -303,7 +310,7 @@ L(SourceStringAlignmentZero): pmovmskb %xmm0, %rdx # ifdef USE_AS_STRNCPY -# if defined USE_AS_STPCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT cmp $32, %r8 # else cmp $33, %r8 @@ -314,11 +321,11 @@ L(SourceStringAlignmentZero): jnz L(CopyFrom1To32Bytes1) jmp L(Unalign16Both) -/* ------End of main part with loops--------------------- 
*/ +/*------End of main part with loops---------------------*/ /* Case1 */ -# if (!defined USE_AS_STRNCPY) +# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) .p2align 4 L(CopyFrom1To16Bytes): add %rcx, %rdi @@ -328,7 +335,7 @@ L(CopyFrom1To16Bytes): # endif .p2align 4 L(CopyFrom1To16BytesTail): -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub %rcx, %r8 # endif add %rcx, %rsi @@ -339,7 +346,7 @@ L(CopyFrom1To16BytesTail): L(CopyFrom1To32Bytes1): add $16, %rsi add $16, %rdi -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $16, %r8 # endif L(CopyFrom1To16BytesTail1): @@ -348,7 +355,7 @@ L(CopyFrom1To16BytesTail1): .p2align 4 L(CopyFrom1To32Bytes): -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub %rcx, %r8 # endif bsf %rdx, %rdx @@ -360,10 +367,10 @@ L(CopyFrom1To32Bytes): .p2align 4 L(CopyFrom1To16BytesUnaligned_0): bsf %rdx, %rdx -# if defined USE_AS_STRNCPY -# ifdef USE_AS_STPCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY lea (%rdi, %rdx), %rax -# endif +# endif movdqu %xmm4, (%rdi) add $63, %r8 sub %rdx, %r8 @@ -377,10 +384,10 @@ L(CopyFrom1To16BytesUnaligned_0): L(CopyFrom1To16BytesUnaligned_16): bsf %rcx, %rdx movdqu %xmm4, (%rdi) -# if defined USE_AS_STRNCPY -# ifdef USE_AS_STPCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY lea 16(%rdi, %rdx), %rax -# endif +# endif movdqu %xmm5, 16(%rdi) add $47, %r8 sub %rdx, %r8 @@ -397,10 +404,10 @@ L(CopyFrom1To16BytesUnaligned_32): bsf %rdx, %rdx movdqu %xmm4, (%rdi) movdqu %xmm5, 16(%rdi) -# if defined USE_AS_STRNCPY -# ifdef USE_AS_STPCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY lea 32(%rdi, %rdx), %rax -# endif +# endif movdqu %xmm6, 32(%rdi) add $31, %r8 sub %rdx, %r8 @@ -413,6 +420,7 @@ L(CopyFrom1To16BytesUnaligned_32): # endif # ifdef USE_AS_STRNCPY +# ifndef USE_AS_STRCAT .p2align 4 
L(CopyFrom1To16BytesUnalignedXmm6): movdqu %xmm6, (%rdi, %rcx) @@ -437,6 +445,7 @@ L(CopyFrom1To16BytesUnalignedXmm3): L(CopyFrom1To16BytesUnalignedXmm1): movdqu %xmm1, (%rdi, %rcx) jmp L(CopyFrom1To16BytesXmmExit) +# endif .p2align 4 L(CopyFrom1To16BytesExit): @@ -519,7 +528,7 @@ L(CopyFrom1To16BytesTail1Case2OrCase3): # endif -/* ----End labels regarding with copying 1-16 bytes--and 1-32 bytes---- */ +/*------------End labels regarding with copying 1-16 bytes--and 1-32 bytes----*/ .p2align 4 L(Exit1): @@ -527,7 +536,7 @@ L(Exit1): # ifdef USE_AS_STPCPY lea (%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $1, %r8 lea 1(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -541,7 +550,7 @@ L(Exit2): # ifdef USE_AS_STPCPY lea 1(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $2, %r8 lea 2(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -556,7 +565,7 @@ L(Exit3): # ifdef USE_AS_STPCPY lea 2(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $3, %r8 lea 3(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -570,7 +579,7 @@ L(Exit4): # ifdef USE_AS_STPCPY lea 3(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $4, %r8 lea 4(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -585,7 +594,7 @@ L(Exit5): # ifdef USE_AS_STPCPY lea 4(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $5, %r8 lea 5(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -601,7 +610,7 @@ L(Exit6): # ifdef USE_AS_STPCPY lea 5(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $6, %r8 lea 6(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -617,7 +626,7 @@ L(Exit7): # ifdef USE_AS_STPCPY lea 6(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined 
USE_AS_STRCAT sub $7, %r8 lea 7(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -631,7 +640,7 @@ L(Exit8): # ifdef USE_AS_STPCPY lea 7(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $8, %r8 lea 8(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -646,7 +655,7 @@ L(Exit9): # ifdef USE_AS_STPCPY lea 8(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $9, %r8 lea 9(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -662,7 +671,7 @@ L(Exit10): # ifdef USE_AS_STPCPY lea 9(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $10, %r8 lea 10(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -678,7 +687,7 @@ L(Exit11): # ifdef USE_AS_STPCPY lea 10(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $11, %r8 lea 11(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -694,7 +703,7 @@ L(Exit12): # ifdef USE_AS_STPCPY lea 11(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $12, %r8 lea 12(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -710,7 +719,7 @@ L(Exit13): # ifdef USE_AS_STPCPY lea 12(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $13, %r8 lea 13(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -726,7 +735,7 @@ L(Exit14): # ifdef USE_AS_STPCPY lea 13(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $14, %r8 lea 14(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -742,7 +751,7 @@ L(Exit15): # ifdef USE_AS_STPCPY lea 14(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $15, %r8 lea 15(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -756,7 +765,7 @@ L(Exit16): # ifdef USE_AS_STPCPY lea 15(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if 
defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $16, %r8 lea 16(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -771,7 +780,7 @@ L(Exit17): # ifdef USE_AS_STPCPY lea 16(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $17, %r8 lea 17(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -787,7 +796,7 @@ L(Exit18): # ifdef USE_AS_STPCPY lea 17(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $18, %r8 lea 18(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -803,7 +812,7 @@ L(Exit19): # ifdef USE_AS_STPCPY lea 18(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $19, %r8 lea 19(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -819,7 +828,7 @@ L(Exit20): # ifdef USE_AS_STPCPY lea 19(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $20, %r8 lea 20(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -836,7 +845,7 @@ L(Exit21): # ifdef USE_AS_STPCPY lea 20(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $21, %r8 lea 21(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -852,7 +861,7 @@ L(Exit22): # ifdef USE_AS_STPCPY lea 21(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $22, %r8 lea 22(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -868,7 +877,7 @@ L(Exit23): # ifdef USE_AS_STPCPY lea 22(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $23, %r8 lea 23(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -884,7 +893,7 @@ L(Exit24): # ifdef USE_AS_STPCPY lea 23(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $24, %r8 lea 24(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -901,7 +910,7 @@ L(Exit25): # ifdef USE_AS_STPCPY lea 24(%rdi), %rax 
# endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $25, %r8 lea 25(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -919,7 +928,7 @@ L(Exit26): # ifdef USE_AS_STPCPY lea 25(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $26, %r8 lea 26(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -937,7 +946,7 @@ L(Exit27): # ifdef USE_AS_STPCPY lea 26(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $27, %r8 lea 27(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -955,7 +964,7 @@ L(Exit28): # ifdef USE_AS_STPCPY lea 27(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $28, %r8 lea 28(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -971,7 +980,7 @@ L(Exit29): # ifdef USE_AS_STPCPY lea 28(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $29, %r8 lea 29(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -987,7 +996,7 @@ L(Exit30): # ifdef USE_AS_STPCPY lea 29(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $30, %r8 lea 30(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -1003,7 +1012,7 @@ L(Exit31): # ifdef USE_AS_STPCPY lea 30(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $31, %r8 lea 31(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -1019,7 +1028,7 @@ L(Exit32): # ifdef USE_AS_STPCPY lea 31(%rdi), %rax # endif -# if defined USE_AS_STRNCPY +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT sub $32, %r8 lea 32(%rdi), %rdi jnz L(StrncpyFillTailWithZero) @@ -1030,27 +1039,39 @@ L(Exit32): .p2align 4 L(StrncpyExit0): -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY mov %rdi, %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, (%rdi) +# endif ret .p2align 4 L(StrncpyExit1): mov (%rsi), 
%dl mov %dl, (%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 1(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 1(%rdi) +# endif ret .p2align 4 L(StrncpyExit2): mov (%rsi), %dx mov %dx, (%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 2(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 2(%rdi) +# endif ret .p2align 4 @@ -1059,18 +1080,26 @@ L(StrncpyExit3): mov 2(%rsi), %dl mov %cx, (%rdi) mov %dl, 2(%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 3(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 3(%rdi) +# endif ret .p2align 4 L(StrncpyExit4): mov (%rsi), %edx mov %edx, (%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 4(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 4(%rdi) +# endif ret .p2align 4 @@ -1079,9 +1108,13 @@ L(StrncpyExit5): mov 4(%rsi), %dl mov %ecx, (%rdi) mov %dl, 4(%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 5(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 5(%rdi) +# endif ret .p2align 4 @@ -1090,9 +1123,13 @@ L(StrncpyExit6): mov 4(%rsi), %dx mov %ecx, (%rdi) mov %dx, 4(%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 6(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 6(%rdi) +# endif ret .p2align 4 @@ -1101,18 +1138,26 @@ L(StrncpyExit7): mov 3(%rsi), %edx mov %ecx, (%rdi) mov %edx, 3(%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 7(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 7(%rdi) +# endif ret .p2align 4 L(StrncpyExit8): mov (%rsi), %rdx mov %rdx, (%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 8(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 8(%rdi) +# endif ret .p2align 4 @@ -1121,9 +1166,13 @@ L(StrncpyExit9): mov 8(%rsi), %dl mov %rcx, (%rdi) mov %dl, 8(%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 9(%rdi), 
%rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 9(%rdi) +# endif ret .p2align 4 @@ -1132,9 +1181,13 @@ L(StrncpyExit10): mov 8(%rsi), %dx mov %rcx, (%rdi) mov %dx, 8(%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 10(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 10(%rdi) +# endif ret .p2align 4 @@ -1143,9 +1196,13 @@ L(StrncpyExit11): mov 7(%rsi), %edx mov %rcx, (%rdi) mov %edx, 7(%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 11(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 11(%rdi) +# endif ret .p2align 4 @@ -1154,9 +1211,13 @@ L(StrncpyExit12): mov 8(%rsi), %edx mov %rcx, (%rdi) mov %edx, 8(%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 12(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 12(%rdi) +# endif ret .p2align 4 @@ -1165,9 +1226,13 @@ L(StrncpyExit13): mov 5(%rsi), %rdx mov %rcx, (%rdi) mov %rdx, 5(%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 13(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 13(%rdi) +# endif ret .p2align 4 @@ -1176,9 +1241,13 @@ L(StrncpyExit14): mov 6(%rsi), %rdx mov %rcx, (%rdi) mov %rdx, 6(%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 14(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 14(%rdi) +# endif ret .p2align 4 @@ -1187,18 +1256,26 @@ L(StrncpyExit15): mov 7(%rsi), %rdx mov %rcx, (%rdi) mov %rdx, 7(%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 15(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 15(%rdi) +# endif ret .p2align 4 L(StrncpyExit16): movdqu (%rsi), %xmm0 movdqu %xmm0, (%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 16(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 16(%rdi) +# endif ret .p2align 4 @@ -1207,9 +1284,13 @@ L(StrncpyExit17): mov 16(%rsi), %cl movdqu %xmm0, (%rdi) mov %cl, 16(%rdi) -# ifdef 
USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 17(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 17(%rdi) +# endif ret .p2align 4 @@ -1218,9 +1299,13 @@ L(StrncpyExit18): mov 16(%rsi), %cx movdqu %xmm0, (%rdi) mov %cx, 16(%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 18(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 18(%rdi) +# endif ret .p2align 4 @@ -1229,9 +1314,13 @@ L(StrncpyExit19): mov 15(%rsi), %ecx movdqu %xmm0, (%rdi) mov %ecx, 15(%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 19(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 19(%rdi) +# endif ret .p2align 4 @@ -1240,9 +1329,13 @@ L(StrncpyExit20): mov 16(%rsi), %ecx movdqu %xmm0, (%rdi) mov %ecx, 16(%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 20(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 20(%rdi) +# endif ret .p2align 4 @@ -1253,9 +1346,13 @@ L(StrncpyExit21): movdqu %xmm0, (%rdi) mov %ecx, 16(%rdi) mov %dl, 20(%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 21(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 21(%rdi) +# endif ret .p2align 4 @@ -1264,9 +1361,13 @@ L(StrncpyExit22): mov 14(%rsi), %rcx movdqu %xmm0, (%rdi) mov %rcx, 14(%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 22(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 22(%rdi) +# endif ret .p2align 4 @@ -1275,9 +1376,13 @@ L(StrncpyExit23): mov 15(%rsi), %rcx movdqu %xmm0, (%rdi) mov %rcx, 15(%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 23(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 23(%rdi) +# endif ret .p2align 4 @@ -1286,9 +1391,13 @@ L(StrncpyExit24): mov 16(%rsi), %rcx movdqu %xmm0, (%rdi) mov %rcx, 16(%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 24(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 24(%rdi) +# 
endif ret .p2align 4 @@ -1299,9 +1408,13 @@ L(StrncpyExit25): movdqu %xmm0, (%rdi) mov %rdx, 16(%rdi) mov %cl, 24(%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 25(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 25(%rdi) +# endif ret .p2align 4 @@ -1312,9 +1425,13 @@ L(StrncpyExit26): movdqu %xmm0, (%rdi) mov %rdx, 16(%rdi) mov %cx, 24(%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 26(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 26(%rdi) +# endif ret .p2align 4 @@ -1325,9 +1442,13 @@ L(StrncpyExit27): movdqu %xmm0, (%rdi) mov %rdx, 16(%rdi) mov %ecx, 23(%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 27(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 27(%rdi) +# endif ret .p2align 4 @@ -1338,9 +1459,13 @@ L(StrncpyExit28): movdqu %xmm0, (%rdi) mov %rdx, 16(%rdi) mov %ecx, 24(%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 28(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 28(%rdi) +# endif ret .p2align 4 @@ -1349,9 +1474,13 @@ L(StrncpyExit29): movdqu 13(%rsi), %xmm2 movdqu %xmm0, (%rdi) movdqu %xmm2, 13(%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 29(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 29(%rdi) +# endif ret .p2align 4 @@ -1360,9 +1489,13 @@ L(StrncpyExit30): movdqu 14(%rsi), %xmm2 movdqu %xmm0, (%rdi) movdqu %xmm2, 14(%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 30(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 30(%rdi) +# endif ret .p2align 4 @@ -1371,9 +1504,13 @@ L(StrncpyExit31): movdqu 15(%rsi), %xmm2 movdqu %xmm0, (%rdi) movdqu %xmm2, 15(%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 31(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 31(%rdi) +# endif ret .p2align 4 @@ -1382,9 +1519,13 @@ L(StrncpyExit32): movdqu 16(%rsi), %xmm2 movdqu %xmm0, (%rdi) movdqu 
%xmm2, 16(%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 32(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 32(%rdi) +# endif ret .p2align 4 @@ -1395,8 +1536,14 @@ L(StrncpyExit33): movdqu %xmm0, (%rdi) movdqu %xmm2, 16(%rdi) mov %cl, 32(%rdi) +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 33(%rdi) +# endif ret +# ifndef USE_AS_STRCAT + .p2align 4 L(Fill0): ret @@ -1498,9 +1645,9 @@ L(CopyFrom1To16BytesXmmExit): bsf %rdx, %rdx add $15, %r8 add %rcx, %rdi -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea (%rdi, %rdx), %rax -# endif +# endif sub %rdx, %r8 lea 1(%rdi, %rdx), %rdi @@ -1553,6 +1700,9 @@ L(StrncpyFillExit): add $16, %r8 BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4) +/* end of ifndef USE_AS_STRCAT */ +# endif + .p2align 4 L(UnalignedLeaveCase2OrCase3): test %rdx, %rdx @@ -1572,9 +1722,13 @@ L(Unaligned64LeaveCase3): sub $16, %r8 jb L(CopyFrom1To16BytesCase3) movdqu %xmm7, 48(%rdi) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 64(%rdi), %rax -# endif +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 64(%rdi) +# endif ret .p2align 4 @@ -1585,8 +1739,11 @@ L(Unaligned64LeaveCase2): add $48, %r8 jle L(CopyFrom1To16BytesCase2OrCase3) test %rdx, %rdx +# ifndef USE_AS_STRCAT jnz L(CopyFrom1To16BytesUnalignedXmm4) - +# else + jnz L(CopyFrom1To16Bytes) +# endif pcmpeqb %xmm5, %xmm0 pmovmskb %xmm0, %rdx movdqu %xmm4, (%rdi) @@ -1594,7 +1751,11 @@ L(Unaligned64LeaveCase2): sub $16, %r8 jbe L(CopyFrom1To16BytesCase2OrCase3) test %rdx, %rdx +# ifndef USE_AS_STRCAT jnz L(CopyFrom1To16BytesUnalignedXmm5) +# else + jnz L(CopyFrom1To16Bytes) +# endif pcmpeqb %xmm6, %xmm0 pmovmskb %xmm0, %rdx @@ -1603,7 +1764,11 @@ L(Unaligned64LeaveCase2): sub $16, %r8 jbe L(CopyFrom1To16BytesCase2OrCase3) test %rdx, %rdx +# ifndef USE_AS_STRCAT jnz L(CopyFrom1To16BytesUnalignedXmm6) +# else + jnz L(CopyFrom1To16Bytes) +# endif pcmpeqb %xmm7, %xmm0 pmovmskb %xmm0, %rdx @@ -1617,13 +1782,18 @@ L(Unaligned64LeaveCase2): .p2align 4 
L(ExitZero): +# ifndef USE_AS_STRCAT mov %rdi, %rax +# endif ret # endif +# ifndef USE_AS_STRCAT END (STRCPY) - +# else +END (STRCAT) +# endif .p2align 4 .section .rodata L(ExitTable): @@ -1695,6 +1865,7 @@ L(ExitStrncpyTable): .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable)) .int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable)) .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable)) +# ifndef USE_AS_STRCAT .p2align 4 L(FillTable): .int JMPTBL(L(Fill0), L(FillTable)) @@ -1714,5 +1885,7 @@ L(FillTable): .int JMPTBL(L(Fill14), L(FillTable)) .int JMPTBL(L(Fill15), L(FillTable)) .int JMPTBL(L(Fill16), L(FillTable)) +# endif # endif #endif + diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S index efbd3bf..05faf0d 100644 --- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S +++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S @@ -20,25 +20,26 @@ #ifndef NOT_IN_libc -# include <sysdep.h> +# ifndef USE_AS_STRCAT +# include <sysdep.h> -# ifndef STRCPY -# define STRCPY __strcpy_ssse3 -# endif +# ifndef STRCPY +# define STRCPY __strcpy_ssse3 +# endif .section .text.ssse3,"ax",@progbits ENTRY (STRCPY) mov %rsi, %rcx -# ifdef USE_AS_STRNCPY +# ifdef USE_AS_STRNCPY mov %rdx, %r8 -# endif +# endif mov %rdi, %rdx -# ifdef USE_AS_STRNCPY +# ifdef USE_AS_STRNCPY test %r8, %r8 jz L(Exit0) cmp $8, %r8 jbe L(StrncpyExit8Bytes) -# endif +# endif cmpb $0, (%rcx) jz L(Exit1) cmpb $0, 1(%rcx) @@ -55,10 +56,10 @@ ENTRY (STRCPY) jz L(Exit7) cmpb $0, 7(%rcx) jz L(Exit8) -# ifdef USE_AS_STRNCPY +# ifdef USE_AS_STRNCPY cmp $16, %r8 jb L(StrncpyExit15Bytes) -# endif +# endif cmpb $0, 8(%rcx) jz L(Exit9) cmpb $0, 9(%rcx) @@ -73,12 +74,13 @@ ENTRY (STRCPY) jz L(Exit14) cmpb $0, 14(%rcx) jz L(Exit15) -# ifdef USE_AS_STRNCPY +# ifdef USE_AS_STRNCPY cmp $16, %r8 je L(Exit16) -# endif +# endif cmpb $0, 15(%rcx) jz L(Exit16) +# endif # ifdef USE_AS_STRNCPY mov %rcx, %rsi @@ -2180,12 +2182,12 @@ L(Shl15LoopExit): jmp L(CopyFrom1To16Bytes) # endif - +# ifndef USE_AS_STRCAT .p2align 4
L(CopyFrom1To16Bytes): -# ifdef USE_AS_STRNCPY +# ifdef USE_AS_STRNCPY add $16, %r8 -# endif +# endif add %rsi, %rdx add %rsi, %rcx @@ -2210,20 +2212,20 @@ L(CopyFrom1To16Bytes): L(Exit8): mov (%rcx), %rax mov %rax, (%rdx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 7(%rdx), %rax -# else +# else mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $8, %r8 lea 8(%rdx), %rcx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax -# endif -# endif +# endif +# endif ret .p2align 4 @@ -2249,23 +2251,23 @@ L(Exit16): mov %rax, (%rdx) mov 8(%rcx), %rax mov %rax, 8(%rdx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 15(%rdx), %rax -# else +# else mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $16, %r8 lea 16(%rdx), %rcx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax -# endif -# endif +# endif +# endif ret -# ifdef USE_AS_STRNCPY +# ifdef USE_AS_STRNCPY .p2align 4 L(CopyFrom1To16BytesCase2): @@ -2381,46 +2383,46 @@ L(Less12Case3): /* but more than 8 */ jl L(Exit9) je L(Exit10) jg L(Exit11) -# endif +# endif .p2align 4 L(Exit1): movb (%rcx), %al movb %al, (%rdx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea (%rdx), %rax -# else +# else mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $1, %r8 lea 1(%rdx), %rcx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax -# endif -# endif +# endif +# endif ret .p2align 4 L(Exit2): movw (%rcx), %ax movw %ax, (%rdx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 1(%rdx), %rax -# else +# else mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $2, %r8 lea 2(%rdx), %rcx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax -# endif -# endif +# endif +# endif ret 
.p2align 4 @@ -2429,40 +2431,40 @@ L(Exit3): movw %ax, (%rdx) movb 2(%rcx), %al movb %al, 2(%rdx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 2(%rdx), %rax -# else +# else mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $3, %r8 lea 3(%rdx), %rcx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax -# endif -# endif +# endif +# endif ret .p2align 4 L(Exit4): movl (%rcx), %eax movl %eax, (%rdx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 3(%rdx), %rax -# else +# else mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $4, %r8 lea 4(%rdx), %rcx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax -# endif -# endif +# endif +# endif ret .p2align 4 @@ -2471,20 +2473,20 @@ L(Exit5): movl %eax, (%rdx) movb 4(%rcx), %al movb %al, 4(%rdx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 4(%rdx), %rax -# else +# else mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $5, %r8 lea 5(%rdx), %rcx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax -# endif -# endif +# endif +# endif ret .p2align 4 @@ -2493,20 +2495,20 @@ L(Exit6): movl %eax, (%rdx) movw 4(%rcx), %ax movw %ax, 4(%rdx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 5(%rdx), %rax -# else +# else mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $6, %r8 lea 6(%rdx), %rcx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax -# endif -# endif +# endif +# endif ret .p2align 4 @@ -2515,20 +2517,20 @@ L(Exit7): movl %eax, (%rdx) movl 3(%rcx), %eax movl %eax, 3(%rdx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 6(%rdx), %rax -# else +# else mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $7, %r8 lea 7(%rdx), 
%rcx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax -# endif -# endif +# endif +# endif ret .p2align 4 @@ -2537,20 +2539,20 @@ L(Exit9): mov %rax, (%rdx) mov 5(%rcx), %eax mov %eax, 5(%rdx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 8(%rdx), %rax -# else +# else mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $9, %r8 lea 9(%rdx), %rcx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax -# endif -# endif +# endif +# endif ret .p2align 4 @@ -2559,20 +2561,20 @@ L(Exit10): mov %rax, (%rdx) mov 6(%rcx), %eax mov %eax, 6(%rdx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 9(%rdx), %rax -# else +# else mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $10, %r8 lea 10(%rdx), %rcx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax -# endif -# endif +# endif +# endif ret .p2align 4 @@ -2581,20 +2583,20 @@ L(Exit11): mov %rax, (%rdx) mov 7(%rcx), %eax mov %eax, 7(%rdx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 10(%rdx), %rax -# else +# else mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $11, %r8 lea 11(%rdx), %rcx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax -# endif -# endif +# endif +# endif ret .p2align 4 @@ -2603,20 +2605,20 @@ L(Exit12): mov %rax, (%rdx) mov 8(%rcx), %eax mov %eax, 8(%rdx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 11(%rdx), %rax -# else +# else mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $12, %r8 lea 12(%rdx), %rcx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax -# endif -# endif +# endif +# endif ret .p2align 4 @@ -2625,20 +2627,20 @@ L(Exit13): mov %rax, (%rdx) mov 5(%rcx), %rax mov 
%rax, 5(%rdx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 12(%rdx), %rax -# else +# else mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $13, %r8 lea 13(%rdx), %rcx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax -# endif -# endif +# endif +# endif ret .p2align 4 @@ -2647,20 +2649,20 @@ L(Exit14): mov %rax, (%rdx) mov 6(%rcx), %rax mov %rax, 6(%rdx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 13(%rdx), %rax -# else +# else mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $14, %r8 lea 14(%rdx), %rcx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax -# endif -# endif +# endif +# endif ret .p2align 4 @@ -2669,23 +2671,23 @@ L(Exit15): mov %rax, (%rdx) mov 7(%rcx), %rax mov %rax, 7(%rdx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 14(%rdx), %rax -# else +# else mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY +# endif +# ifdef USE_AS_STRNCPY sub $15, %r8 lea 15(%rdx), %rcx jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax -# endif -# endif +# endif +# endif ret -# ifdef USE_AS_STRNCPY +# ifdef USE_AS_STRNCPY .p2align 4 L(Fill0): ret @@ -2902,13 +2904,13 @@ L(StrncpyExit15Bytes): mov %rax, (%rdx) mov 7(%rcx), %rax mov %rax, 7(%rdx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 14(%rdx), %rax cmpb $1, (%rax) sbb $-1, %rax -# else +# else mov %rdi, %rax -# endif +# endif ret .p2align 4 @@ -2943,15 +2945,17 @@ L(StrncpyExit8Bytes): jz L(Exit7) mov (%rcx), %rax mov %rax, (%rdx) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY lea 7(%rdx), %rax cmpb $1, (%rax) sbb $-1, %rax -# else +# else mov %rdi, %rax -# endif +# endif ret +# endif + # endif # ifdef USE_AS_STRNCPY @@ -3715,7 +3719,7 @@ L(StrncpyExit15): lea 1(%rsi), %rsi jmp L(CopyFrom1To16BytesCase3) # endif - +# ifndef USE_AS_STRCAT END (STRCPY) - +# endif 
#endif diff --git a/sysdeps/x86_64/multiarch/strlen-no-bsf.S b/sysdeps/x86_64/multiarch/strlen-no-bsf.S index 3e52f81..c730e0a 100644 --- a/sysdeps/x86_64/multiarch/strlen-no-bsf.S +++ b/sysdeps/x86_64/multiarch/strlen-no-bsf.S @@ -1,5 +1,5 @@ -/* strlen without BSF - Copyright (C) 2010 Free Software Foundation, Inc. +/* strlen SSE2 without bsf + Copyright (C) 2010, 2011 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -18,12 +18,17 @@ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ -#if defined SHARED && !defined NOT_IN_libc +#if (defined SHARED || defined USE_AS_STRCAT) && !defined NOT_IN_libc -#include <sysdep.h> +# ifndef USE_AS_STRCAT - .section .text.slow,"ax",@progbits +# include <sysdep.h> + +# define RETURN ret + + .section .text.sse2,"ax",@progbits ENTRY (__strlen_no_bsf) +# endif xor %eax, %eax cmpb $0, (%rdi) jz L(exit_tail0) @@ -165,39 +170,37 @@ ENTRY (__strlen_no_bsf) jnz L(exit) and $-0x40, %rax - xor %r8d, %r8d L(aligned_64): pcmpeqb (%rax), %xmm0 pcmpeqb 16(%rax), %xmm1 pcmpeqb 32(%rax), %xmm2 pcmpeqb 48(%rax), %xmm3 pmovmskb %xmm0, %edx - pmovmskb %xmm1, %esi - pmovmskb %xmm2, %edi + pmovmskb %xmm1, %r11d + pmovmskb %xmm2, %r10d pmovmskb %xmm3, %r9d - or %edx, %r8d - or %esi, %r8d - or %edi, %r8d - or %r9d, %r8d + or %edx, %r9d + or %r11d, %r9d + or %r10d, %r9d lea 64(%rax), %rax jz L(aligned_64) test %edx, %edx jnz L(aligned_64_exit_16) - test %esi, %esi + test %r11d, %r11d jnz L(aligned_64_exit_32) - test %edi, %edi + test %r10d, %r10d jnz L(aligned_64_exit_48) L(aligned_64_exit_64): - mov %r9d, %edx + pmovmskb %xmm3, %edx jmp L(aligned_64_exit) L(aligned_64_exit_48): lea -16(%rax), %rax - mov %edi, %edx + mov %r10d, %edx jmp L(aligned_64_exit) L(aligned_64_exit_32): lea -32(%rax), %rax - mov %esi, %edx + mov %r11d, %edx jmp L(aligned_64_exit) L(aligned_64_exit_16): lea -48(%rax), %rax @@ -228,7 +231,7 @@ L(exit): jnz L(exit_tail6) add $7, %eax L(exit_tail0): - ret +
RETURN L(exit_high): add $8, %eax @@ -253,57 +256,58 @@ L(exit_high): test $0x40, %dh jnz L(exit_tail6) add $7, %eax - ret + RETURN .p2align 4 L(exit_tail1): add $1, %eax - ret + RETURN L(exit_tail2): add $2, %eax - ret + RETURN L(exit_tail3): add $3, %eax - ret + RETURN L(exit_tail4): add $4, %eax - ret + RETURN L(exit_tail5): add $5, %eax - ret + RETURN L(exit_tail6): add $6, %eax - ret + RETURN L(exit_tail7): add $7, %eax - ret + RETURN L(exit_tail8): add $8, %eax - ret + RETURN L(exit_tail9): add $9, %eax - ret + RETURN L(exit_tail10): add $10, %eax - ret + RETURN L(exit_tail11): add $11, %eax - ret + RETURN L(exit_tail12): add $12, %eax - ret + RETURN L(exit_tail13): add $13, %eax - ret + RETURN L(exit_tail14): add $14, %eax - ret + RETURN L(exit_tail15): add $15, %eax - ret +# ifndef USE_AS_STRCAT + RETURN END (__strlen_no_bsf) - +# endif #endif diff --git a/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S b/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S new file mode 100644 index 0000000..57778cf --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S @@ -0,0 +1,260 @@ +/* strlen SSE2 + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#if !defined NOT_IN_libc && (defined SHARED || defined USE_AS_STRCAT) + +# ifndef USE_AS_STRCAT + +# include <sysdep.h> + +# define RETURN ret + + .section .text.sse2,"ax",@progbits +ENTRY (__strlen_sse2_pminub) + +# endif + xor %rax, %rax + mov %edi, %ecx + and $0x3f, %ecx + pxor %xmm0, %xmm0 + cmp $0x30, %ecx + ja L(next) + movdqu (%rdi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit_less16) + mov %rdi, %rax + and $-16, %rax + jmp L(align16_start) +L(next): + mov %rdi, %rax + and $-16, %rax + pcmpeqb (%rax), %xmm0 + mov $-1, %r10d + sub %rax, %rcx + shl %cl, %r10d + pmovmskb %xmm0, %edx + and %r10d, %edx + jnz L(exit) +L(align16_start): + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + pcmpeqb 16(%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + pcmpeqb 80(%rax), %xmm0 + add $64, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + pcmpeqb 80(%rax), %xmm0 + add $64, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + pcmpeqb 80(%rax), %xmm0 + add $64, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + +
pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 80(%rax), %xmm0 + add $80, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 16(%rax), %xmm1 + add $16, %rax + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 16(%rax), %xmm2 + add $16, %rax + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 16(%rax), %xmm3 + add $16, %rax + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit) + + add $16, %rax + .p2align 4 + L(align64_loop): + movaps (%rax), %xmm4 + pminub 16(%rax), %xmm4 + movaps 32(%rax), %xmm5 + pminub 48(%rax), %xmm5 + add $64, %rax + pminub %xmm4, %xmm5 + pcmpeqb %xmm0, %xmm5 + pmovmskb %xmm5, %edx + test %edx, %edx + jz L(align64_loop) + + + pcmpeqb -64(%rax), %xmm0 + sub $80, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $64, %rax + RETURN + + .p2align 4 +L(exit): + sub %rdi, %rax +L(exit_less16): + bsf %rdx, %rdx + add %rdx, %rax + RETURN + .p2align 4 +L(exit16): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $16, %rax + RETURN + .p2align 4 +L(exit32): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $32, %rax + RETURN + .p2align 4 +L(exit48): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $48, %rax + RETURN + .p2align 4 +L(exit64): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $64, %rax +# ifndef USE_AS_STRCAT + RETURN + +END (__strlen_sse2_pminub) +# endif +#endif diff --git 
a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S index 83a88ec..d789707 100644 --- a/sysdeps/x86_64/multiarch/strlen.S +++ b/sysdeps/x86_64/multiarch/strlen.S @@ -32,7 +32,10 @@ ENTRY(strlen) cmpl $0, __cpu_features+KIND_OFFSET(%rip) jne 1f call __init_cpu_features -1: leaq __strlen_sse2(%rip), %rax +1: leaq __strlen_sse2_pminub(%rip), %rax + testl $bit_Prefer_PMINUB_for_stringop, __cpu_features+FEATURE_OFFSET+index_Prefer_PMINUB_for_stringop(%rip) + jnz 2f + leaq __strlen_sse2(%rip), %rax testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) jz 2f leaq __strlen_sse42(%rip), %rax diff --git a/sysdeps/x86_64/multiarch/strncat-c.c b/sysdeps/x86_64/multiarch/strncat-c.c new file mode 100644 index 0000000..a3cdbff --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncat-c.c @@ -0,0 +1,8 @@ +#define STRNCAT __strncat_sse2 +#ifdef SHARED +#undef libc_hidden_def +#define libc_hidden_def(name) \ + __hidden_ver1 (__strncat_sse2, __GI___strncat, __strncat_sse2); +#endif + +#include "string/strncat.c" diff --git a/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S new file mode 100644 index 0000000..133e1d2 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCAT +#define STRCAT __strncat_sse2_unaligned +#include "strcat-sse2-unaligned.S" diff --git a/sysdeps/x86_64/multiarch/strncat-ssse3.S b/sysdeps/x86_64/multiarch/strncat-ssse3.S new file mode 100644 index 0000000..6c45ff3 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncat-ssse3.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCAT +#define STRCAT __strncat_ssse3 +#include "strcat-ssse3.S" diff --git a/sysdeps/x86_64/multiarch/strncat.S b/sysdeps/x86_64/multiarch/strncat.S new file mode 100644 index 0000000..fd569c2 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncat.S @@ -0,0 +1,3 @@ +#define STRCAT strncat +#define USE_AS_STRNCAT +#include "strcat.S" -- 2.7.4