From: Liubov Dmitrieva Date: Sun, 23 Oct 2011 18:56:04 +0000 (-0400) Subject: Optimized strnlen and wcslen for x86-64 X-Git-Tag: upstream/2.20~4947 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=ce7dd29f2863820ba858fe06c2ff61417df40d75;p=platform%2Fupstream%2Flinaro-glibc.git Optimized strnlen and wcslen for x86-64 --- diff --git a/ChangeLog b/ChangeLog index eb80349..542869b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,17 @@ +2011-10-20 Liubov Dmitrieva + + * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add + strnlen-sse2-no-bsf. + Rename strlen-no-bsf to strlen-sse2-no-bsf. + * sysdeps/x86_64/multiarch/strlen-no-bsf.S: Rename to + * sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S: + Add strnlen support. + (USE_AS_STRNLEN): New macro. + * sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S: New file. + * sysdeps/x86_64/multiarch/strcat-ssse3.S: Update. + Rename strlen-no-bsf.S to strlen-sse2-no-bsf.S + * sysdeps/x86_64/wcslen.S: New file. + 2011-10-20 Michael Zolotukhin * sysdeps/i386/i686/multiarch/memcpy-ssse3.S: Update. diff --git a/NEWS b/NEWS index 5e55ade..ad6ddc7 100644 --- a/NEWS +++ b/NEWS @@ -26,8 +26,8 @@ Version 2.15 * Improved strcpy, strncpy, stpcpy, stpncpy for SSE2 and SSSE3 on x86-64. Contributed by HJ Lu. -* Optimized strcat and strncat on x86-64 and optimized wcscmp on x86-32 and - x86-64. +* Optimized strcat, strncat, wcslen, strnlen on x86-64 and optimized + wcscmp on x86-32 and x86-64. Contributed by Liubov Dmitrieva. * Optimized strchr and strrchr for SSE on x86-32. 
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index e0bb984..4cf4cf4 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -9,13 +9,13 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \ strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \ memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \ memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \ - strncase_l-ssse3 strlen-sse4 strlen-no-bsf memset-x86-64 \ + strncase_l-ssse3 strlen-sse4 strlen-sse2-no-bsf memset-x86-64 \ strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \ strcpy-sse2-unaligned strncpy-sse2-unaligned \ stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ strcat-sse2-unaligned strncat-sse2-unaligned \ strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \ - strrchr-sse2-no-bsf strchr-sse2-no-bsf \ + strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \ memcmp-ssse3 wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S index 34b61b8..2ec3ba7 100644 --- a/sysdeps/x86_64/multiarch/strcat-ssse3.S +++ b/sysdeps/x86_64/multiarch/strcat-ssse3.S @@ -35,7 +35,7 @@ ENTRY (STRCAT) # endif # define RETURN jmp L(StartStrcpyPart) -# include "strlen-no-bsf.S" +# include "strlen-sse2-no-bsf.S" # undef RETURN diff --git a/sysdeps/x86_64/multiarch/strlen-no-bsf.S b/sysdeps/x86_64/multiarch/strlen-no-bsf.S deleted file mode 100644 index a430e5f..0000000 --- a/sysdeps/x86_64/multiarch/strlen-no-bsf.S +++ /dev/null @@ -1,313 +0,0 @@ -/* strlen SSE2 without bsf - Copyright (C) 2010, 2011 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#if (defined SHARED || defined USE_AS_STRCAT) && !defined NOT_IN_libc - -# ifndef USE_AS_STRCAT - -# include <sysdep.h> - -# define RETURN ret - - atom_text_section -ENTRY (__strlen_no_bsf) -# endif - xor %eax, %eax - cmpb $0, (%rdi) - jz L(exit_tail0) - cmpb $0, 1(%rdi) - jz L(exit_tail1) - cmpb $0, 2(%rdi) - jz L(exit_tail2) - cmpb $0, 3(%rdi) - jz L(exit_tail3) - cmpb $0, 4(%rdi) - jz L(exit_tail4) - cmpb $0, 5(%rdi) - jz L(exit_tail5) - cmpb $0, 6(%rdi) - jz L(exit_tail6) - cmpb $0, 7(%rdi) - jz L(exit_tail7) - cmpb $0, 8(%rdi) - jz L(exit_tail8) - cmpb $0, 9(%rdi) - jz L(exit_tail9) - cmpb $0, 10(%rdi) - jz L(exit_tail10) - cmpb $0, 11(%rdi) - jz L(exit_tail11) - cmpb $0, 12(%rdi) - jz L(exit_tail12) - cmpb $0, 13(%rdi) - jz L(exit_tail13) - cmpb $0, 14(%rdi) - jz L(exit_tail14) - cmpb $0, 15(%rdi) - jz L(exit_tail15) - pxor %xmm0, %xmm0 - mov %rdi, %rcx - mov %rdi, %rax - and $-16, %rax - add $16, %rax - add $16, %rcx - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - pxor %xmm1, %xmm1 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - pxor %xmm2, %xmm2 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - pxor %xmm3, %xmm3 - test 
%edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - and $-0x40, %rax -L(aligned_64): - pcmpeqb (%rax), %xmm0 - pcmpeqb 16(%rax), %xmm1 - pcmpeqb 32(%rax), %xmm2 - pcmpeqb 48(%rax), %xmm3 - pmovmskb %xmm0, %edx - pmovmskb %xmm1, %r11d - pmovmskb %xmm2, %r10d - pmovmskb %xmm3, %r9d - or %edx, %r9d - or %r11d, %r9d - or %r10d, %r9d - lea 64(%rax), %rax - jz L(aligned_64) - - test %edx, %edx - jnz L(aligned_64_exit_16) - test %r11d, %r11d - jnz L(aligned_64_exit_32) - test %r10d, %r10d - jnz L(aligned_64_exit_48) -L(aligned_64_exit_64): - pmovmskb %xmm3, %edx - jmp L(aligned_64_exit) -L(aligned_64_exit_48): - lea -16(%rax), %rax - mov %r10d, %edx - jmp L(aligned_64_exit) 
-L(aligned_64_exit_32): - lea -32(%rax), %rax - mov %r11d, %edx - jmp L(aligned_64_exit) -L(aligned_64_exit_16): - lea -48(%rax), %rax -L(aligned_64_exit): -L(exit): - sub %rcx, %rax - test %dl, %dl - jz L(exit_high) - test $0x01, %dl - jnz L(exit_tail0) - - test $0x02, %dl - jnz L(exit_tail1) - - test $0x04, %dl - jnz L(exit_tail2) - - test $0x08, %dl - jnz L(exit_tail3) - - test $0x10, %dl - jnz L(exit_tail4) - - test $0x20, %dl - jnz L(exit_tail5) - - test $0x40, %dl - jnz L(exit_tail6) - add $7, %eax -L(exit_tail0): - RETURN - -L(exit_high): - add $8, %eax - test $0x01, %dh - jnz L(exit_tail0) - - test $0x02, %dh - jnz L(exit_tail1) - - test $0x04, %dh - jnz L(exit_tail2) - - test $0x08, %dh - jnz L(exit_tail3) - - test $0x10, %dh - jnz L(exit_tail4) - - test $0x20, %dh - jnz L(exit_tail5) - - test $0x40, %dh - jnz L(exit_tail6) - add $7, %eax - RETURN - .p2align 4 -L(exit_tail1): - add $1, %eax - RETURN - -L(exit_tail2): - add $2, %eax - RETURN - -L(exit_tail3): - add $3, %eax - RETURN - -L(exit_tail4): - add $4, %eax - RETURN - -L(exit_tail5): - add $5, %eax - RETURN -L(exit_tail6): - add $6, %eax - RETURN -L(exit_tail7): - add $7, %eax - RETURN -L(exit_tail8): - add $8, %eax - RETURN -L(exit_tail9): - add $9, %eax - RETURN -L(exit_tail10): - add $10, %eax - RETURN -L(exit_tail11): - add $11, %eax - RETURN -L(exit_tail12): - add $12, %eax - RETURN -L(exit_tail13): - add $13, %eax - RETURN -L(exit_tail14): - add $14, %eax - RETURN -L(exit_tail15): - add $15, %eax -# ifndef USE_AS_STRCAT - RETURN -END (__strlen_no_bsf) -# endif -#endif diff --git a/sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S new file mode 100644 index 0000000..0b1c973 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S @@ -0,0 +1,686 @@ +/* strlen SSE2 without bsf + Copyright (C) 2010, 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +/* For the strlen case only: the optimized version is not used for STATIC builds, just for SHARED. */ + +#if (defined SHARED || defined USE_AS_STRCAT || defined USE_AS_STRNLEN) && !defined NOT_IN_libc + +# ifndef USE_AS_STRCAT + +# include <sysdep.h> + +# define RETURN ret + +# ifndef STRLEN +# define STRLEN __strlen_sse2_no_bsf +# endif + + atom_text_section +ENTRY (STRLEN) +# endif + xor %eax, %eax +# ifdef USE_AS_STRNLEN + mov %rsi, %r8 + sub $4, %rsi + jbe L(len_less4_prolog) +# endif + cmpb $0, (%rdi) + jz L(exit_tail0) + cmpb $0, 1(%rdi) + jz L(exit_tail1) + cmpb $0, 2(%rdi) + jz L(exit_tail2) + cmpb $0, 3(%rdi) + jz L(exit_tail3) + +# ifdef USE_AS_STRNLEN + sub $4, %rsi + jbe L(len_less8_prolog) +# endif + + cmpb $0, 4(%rdi) + jz L(exit_tail4) + cmpb $0, 5(%rdi) + jz L(exit_tail5) + cmpb $0, 6(%rdi) + jz L(exit_tail6) + cmpb $0, 7(%rdi) + jz L(exit_tail7) + +# ifdef USE_AS_STRNLEN + sub $4, %rsi + jbe L(len_less12_prolog) +# endif + + cmpb $0, 8(%rdi) + jz L(exit_tail8) + cmpb $0, 9(%rdi) + jz L(exit_tail9) + cmpb $0, 10(%rdi) + jz L(exit_tail10) + cmpb $0, 11(%rdi) + jz L(exit_tail11) + +# ifdef USE_AS_STRNLEN + sub $4, %rsi + jbe L(len_less16_prolog) +# endif + + cmpb $0, 12(%rdi) + jz L(exit_tail12) + cmpb $0, 13(%rdi) + jz L(exit_tail13) + cmpb 
$0, 14(%rdi) + jz L(exit_tail14) + cmpb $0, 15(%rdi) + jz L(exit_tail15) + pxor %xmm0, %xmm0 + lea 16(%rdi), %rcx + lea 16(%rdi), %rax + and $-16, %rax + +# ifdef USE_AS_STRNLEN + and $15, %rdi + add %rdi, %rsi + sub $64, %rsi + jbe L(len_less64) +# endif + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + +# ifdef USE_AS_STRNLEN + sub $64, %rsi + jbe L(len_less64) +# endif + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + +# ifdef USE_AS_STRNLEN + sub $64, %rsi + jbe L(len_less64) +# endif + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + +# ifdef USE_AS_STRNLEN + sub $64, %rsi + jbe L(len_less64) +# endif + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb 
%xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + +# ifdef USE_AS_STRNLEN + mov %rax, %rdx + and $63, %rdx + add %rdx, %rsi +# endif + + and $-0x40, %rax + + .p2align 4 +L(aligned_64): +# ifdef USE_AS_STRNLEN + sub $64, %rsi + jbe L(len_less64) +# endif + pcmpeqb (%rax), %xmm0 + pcmpeqb 16(%rax), %xmm1 + pcmpeqb 32(%rax), %xmm2 + pcmpeqb 48(%rax), %xmm3 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %r11d + pmovmskb %xmm2, %r10d + pmovmskb %xmm3, %r9d + or %edx, %r9d + or %r11d, %r9d + or %r10d, %r9d + lea 64(%rax), %rax + jz L(aligned_64) + + test %edx, %edx + jnz L(aligned_64_exit_16) + test %r11d, %r11d + jnz L(aligned_64_exit_32) + test %r10d, %r10d + jnz L(aligned_64_exit_48) +L(aligned_64_exit_64): + pmovmskb %xmm3, %edx + jmp L(aligned_64_exit) +L(aligned_64_exit_48): + lea -16(%rax), %rax + mov %r10d, %edx + jmp L(aligned_64_exit) +L(aligned_64_exit_32): + lea -32(%rax), %rax + mov %r11d, %edx + jmp L(aligned_64_exit) +L(aligned_64_exit_16): + lea -48(%rax), %rax +L(aligned_64_exit): +L(exit): + sub %rcx, %rax + test %dl, %dl + jz L(exit_high) + test $0x01, %dl + jnz L(exit_tail0) + + test $0x02, %dl + jnz L(exit_tail1) + + test $0x04, %dl + jnz L(exit_tail2) + + test $0x08, %dl + jnz L(exit_tail3) + + test $0x10, %dl + jnz L(exit_tail4) + + test $0x20, %dl + jnz L(exit_tail5) + + test $0x40, %dl + jnz L(exit_tail6) + add $7, %eax +L(exit_tail0): + RETURN + +L(exit_high): + add $8, %eax + test $0x01, %dh + jnz L(exit_tail0) + + test $0x02, %dh + jnz L(exit_tail1) + + test $0x04, %dh + jnz L(exit_tail2) + + test $0x08, %dh + jnz L(exit_tail3) + + test $0x10, %dh + jnz L(exit_tail4) + + test $0x20, %dh + jnz L(exit_tail5) + + test $0x40, %dh + jnz L(exit_tail6) + add $7, %eax + RETURN + +# ifdef USE_AS_STRNLEN + + .p2align 4 +L(len_less64): + pxor %xmm0, %xmm0 + add $64, %rsi + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + 
lea 16(%rax), %rax + test %edx, %edx + jnz L(strnlen_exit) + + sub $16, %rsi + jbe L(return_start_len) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + lea 16(%rax), %rax + test %edx, %edx + jnz L(strnlen_exit) + + sub $16, %rsi + jbe L(return_start_len) + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + lea 16(%rax), %rax + test %edx, %edx + jnz L(strnlen_exit) + + sub $16, %rsi + jbe L(return_start_len) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + lea 16(%rax), %rax + test %edx, %edx + jnz L(strnlen_exit) + + mov %r8, %rax + ret + + .p2align 4 +L(strnlen_exit): + sub %rcx, %rax + + test %dl, %dl + jz L(strnlen_exit_high) + mov %dl, %cl + and $15, %cl + jz L(strnlen_exit_8) + test $0x01, %dl + jnz L(exit_tail0) + test $0x02, %dl + jnz L(strnlen_exit_tail1) + test $0x04, %dl + jnz L(strnlen_exit_tail2) + sub $4, %rsi + jb L(return_start_len) + lea 3(%eax), %eax + ret + + .p2align 4 +L(strnlen_exit_8): + test $0x10, %dl + jnz L(strnlen_exit_tail4) + test $0x20, %dl + jnz L(strnlen_exit_tail5) + test $0x40, %dl + jnz L(strnlen_exit_tail6) + sub $8, %rsi + jb L(return_start_len) + lea 7(%eax), %eax + ret + + .p2align 4 +L(strnlen_exit_high): + mov %dh, %ch + and $15, %ch + jz L(strnlen_exit_high_8) + test $0x01, %dh + jnz L(strnlen_exit_tail8) + test $0x02, %dh + jnz L(strnlen_exit_tail9) + test $0x04, %dh + jnz L(strnlen_exit_tail10) + sub $12, %rsi + jb L(return_start_len) + lea 11(%eax), %eax + ret + + .p2align 4 +L(strnlen_exit_high_8): + test $0x10, %dh + jnz L(strnlen_exit_tail12) + test $0x20, %dh + jnz L(strnlen_exit_tail13) + test $0x40, %dh + jnz L(strnlen_exit_tail14) + sub $16, %rsi + jb L(return_start_len) + lea 15(%eax), %eax + ret + + .p2align 4 +L(strnlen_exit_tail1): + sub $2, %rsi + jb L(return_start_len) + lea 1(%eax), %eax + ret + + .p2align 4 +L(strnlen_exit_tail2): + sub $3, %rsi + jb L(return_start_len) + lea 2(%eax), %eax + ret + + .p2align 4 +L(strnlen_exit_tail4): + sub $5, %rsi + jb L(return_start_len) + lea 4(%eax), %eax + ret + + 
.p2align 4 +L(strnlen_exit_tail5): + sub $6, %rsi + jb L(return_start_len) + lea 5(%eax), %eax + ret + + .p2align 4 +L(strnlen_exit_tail6): + sub $7, %rsi + jb L(return_start_len) + lea 6(%eax), %eax + ret + + .p2align 4 +L(strnlen_exit_tail8): + sub $9, %rsi + jb L(return_start_len) + lea 8(%eax), %eax + ret + + .p2align 4 +L(strnlen_exit_tail9): + sub $10, %rsi + jb L(return_start_len) + lea 9(%eax), %eax + ret + + .p2align 4 +L(strnlen_exit_tail10): + sub $11, %rsi + jb L(return_start_len) + lea 10(%eax), %eax + ret + + .p2align 4 +L(strnlen_exit_tail12): + sub $13, %rsi + jb L(return_start_len) + lea 12(%eax), %eax + ret + + .p2align 4 +L(strnlen_exit_tail13): + sub $14, %rsi + jb L(return_start_len) + lea 13(%eax), %eax + ret + + .p2align 4 +L(strnlen_exit_tail14): + sub $15, %rsi + jb L(return_start_len) + lea 14(%eax), %eax + ret + + .p2align 4 +L(return_start_len): + mov %r8, %rax + ret + +/* for prolog only */ + + .p2align 4 +L(len_less4_prolog): + add $4, %rsi + jz L(exit_tail0) + + cmpb $0, (%rdi) + jz L(exit_tail0) + cmp $1, %esi + je L(exit_tail1) + + cmpb $0, 1(%rdi) + jz L(exit_tail1) + cmp $2, %esi + je L(exit_tail2) + + cmpb $0, 2(%rdi) + jz L(exit_tail2) + cmp $3, %esi + je L(exit_tail3) + + cmpb $0, 3(%rdi) + jz L(exit_tail3) + mov $4, %eax + ret + + .p2align 4 +L(len_less8_prolog): + add $4, %rsi + + cmpb $0, 4(%rdi) + jz L(exit_tail4) + cmp $1, %esi + je L(exit_tail5) + + cmpb $0, 5(%rdi) + jz L(exit_tail5) + cmp $2, %esi + je L(exit_tail6) + + cmpb $0, 6(%rdi) + jz L(exit_tail6) + cmp $3, %esi + je L(exit_tail7) + + cmpb $0, 7(%rdi) + jz L(exit_tail7) + mov $8, %eax + ret + + .p2align 4 +L(len_less12_prolog): + add $4, %rsi + + cmpb $0, 8(%rdi) + jz L(exit_tail8) + cmp $1, %esi + je L(exit_tail9) + + cmpb $0, 9(%rdi) + jz L(exit_tail9) + cmp $2, %esi + je L(exit_tail10) + + cmpb $0, 10(%rdi) + jz L(exit_tail10) + cmp $3, %esi + je L(exit_tail11) + + cmpb $0, 11(%rdi) + jz L(exit_tail11) + mov $12, %eax + ret + + .p2align 4 
+L(len_less16_prolog): + add $4, %rsi + + cmpb $0, 12(%rdi) + jz L(exit_tail12) + cmp $1, %esi + je L(exit_tail13) + + cmpb $0, 13(%rdi) + jz L(exit_tail13) + cmp $2, %esi + je L(exit_tail14) + + cmpb $0, 14(%rdi) + jz L(exit_tail14) + cmp $3, %esi + je L(exit_tail15) + + cmpb $0, 15(%rdi) + jz L(exit_tail15) + mov $16, %eax + ret +# endif + + .p2align 4 +L(exit_tail1): + add $1, %eax + RETURN + + .p2align 4 +L(exit_tail2): + add $2, %eax + RETURN + + .p2align 4 +L(exit_tail3): + add $3, %eax + RETURN + + .p2align 4 +L(exit_tail4): + add $4, %eax + RETURN + + .p2align 4 +L(exit_tail5): + add $5, %eax + RETURN + + .p2align 4 +L(exit_tail6): + add $6, %eax + RETURN + + .p2align 4 +L(exit_tail7): + add $7, %eax + RETURN + + .p2align 4 +L(exit_tail8): + add $8, %eax + RETURN + + .p2align 4 +L(exit_tail9): + add $9, %eax + RETURN + + .p2align 4 +L(exit_tail10): + add $10, %eax + RETURN + + .p2align 4 +L(exit_tail11): + add $11, %eax + RETURN + + .p2align 4 +L(exit_tail12): + add $12, %eax + RETURN + + .p2align 4 +L(exit_tail13): + add $13, %eax + RETURN + + .p2align 4 +L(exit_tail14): + add $14, %eax + RETURN + + .p2align 4 +L(exit_tail15): + add $15, %eax +# ifndef USE_AS_STRCAT + RETURN +END (STRLEN) +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S index 43e2100..7716d36 100644 --- a/sysdeps/x86_64/multiarch/strlen.S +++ b/sysdeps/x86_64/multiarch/strlen.S @@ -42,7 +42,7 @@ ENTRY(strlen) ret 2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip) jz 3f - leaq __strlen_no_bsf(%rip), %rax + leaq __strlen_sse2_no_bsf(%rip), %rax 3: ret END(strlen) diff --git a/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S new file mode 100644 index 0000000..248328d --- /dev/null +++ b/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNLEN +#define STRLEN __strnlen_sse2_no_bsf +#include "strlen-sse2-no-bsf.S" diff --git 
a/sysdeps/x86_64/multiarch/strnlen.S b/sysdeps/x86_64/multiarch/strnlen.S new file mode 100644 index 0000000..75c9adf --- /dev/null +++ b/sysdeps/x86_64/multiarch/strnlen.S @@ -0,0 +1,55 @@ +/* multiple version of strnlen + Copyright (C) 2011 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <init-arch.h> + + +/* Define multiple versions only for the definition in libc. 
*/ +#ifndef NOT_IN_libc + + .text +ENTRY(__strnlen) + .type __strnlen, @gnu_indirect_function + cmpl $0, __cpu_features+KIND_OFFSET(%rip) + jne 1f + call __init_cpu_features +1: leaq __strnlen_sse2(%rip), %rax + testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip) + jz 2f + leaq __strnlen_sse2_no_bsf(%rip), %rax +2: ret +END(__strnlen) + +# undef ENTRY +# define ENTRY(name) \ + .type __strnlen_sse2, @function; \ + .align 16; \ + __strnlen_sse2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __strnlen_sse2, .-__strnlen_sse2 + +# undef libc_hidden_def +# define libc_hidden_def(name) \ + .globl __GI_strnlen; __GI_strnlen = __strnlen_sse2 +#endif + +#include "../strnlen.S" diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S new file mode 100644 index 0000000..0343ac7 --- /dev/null +++ b/sysdeps/x86_64/wcslen.S @@ -0,0 +1,239 @@ +/* Optimized wcslen for x86-64 with SSE2. + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#include <sysdep.h> + + .text +ENTRY (__wcslen) + cmpl $0, (%rdi) + jz L(exit_tail0) + cmpl $0, 4(%rdi) + jz L(exit_tail1) + cmpl $0, 8(%rdi) + jz L(exit_tail2) + cmpl $0, 12(%rdi) + jz L(exit_tail3) + cmpl $0, 16(%rdi) + jz L(exit_tail4) + cmpl $0, 20(%rdi) + jz L(exit_tail5) + cmpl $0, 24(%rdi) + jz L(exit_tail6) + cmpl $0, 28(%rdi) + jz L(exit_tail7) + + pxor %xmm0, %xmm0 + + lea 32(%rdi), %rax + lea 16(%rdi), %rcx + and $-16, %rax + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + and $-0x40, %rax + + .p2align 4 +L(aligned_64_loop): + movaps (%rax), %xmm0 + movaps 16(%rax), %xmm1 + movaps 32(%rax), %xmm2 + movaps 48(%rax), %xmm6 + + pminub %xmm1, %xmm0 + pminub %xmm6, %xmm2 + pminub %xmm0, %xmm2 + pcmpeqd %xmm3, %xmm2 + pmovmskb %xmm2, %edx + test %edx, 
%edx + lea 64(%rax), %rax + jz L(aligned_64_loop) + + pcmpeqd -64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 48(%rcx), %rcx + jnz L(exit) + + pcmpeqd %xmm1, %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%rcx), %rcx + jnz L(exit) + + pcmpeqd -32(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%rcx), %rcx + jnz L(exit) + + pcmpeqd %xmm6, %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%rcx), %rcx + jnz L(exit) + + jmp L(aligned_64_loop) + + .p2align 4 +L(exit): + sub %rcx, %rax + shr $2, %rax + test %dl, %dl + jz L(exit_high) + + mov %dl, %cl + and $15, %cl + jz L(exit_1) + ret + + .p2align 4 +L(exit_high): + mov %dh, %ch + and $15, %ch + jz L(exit_3) + add $2, %rax + ret + + .p2align 4 +L(exit_1): + add $1, %rax + ret + + .p2align 4 +L(exit_3): + add $3, %rax + ret + + .p2align 4 +L(exit_tail0): + xor %rax, %rax + ret + + .p2align 4 +L(exit_tail1): + mov $1, %rax + ret + + .p2align 4 +L(exit_tail2): + mov $2, %rax + ret + + .p2align 4 +L(exit_tail3): + mov $3, %rax + ret + + .p2align 4 +L(exit_tail4): + mov $4, %rax + ret + + .p2align 4 +L(exit_tail5): + mov $5, %rax + ret + + .p2align 4 +L(exit_tail6): + mov $6, %rax + ret + + .p2align 4 +L(exit_tail7): + mov $7, %rax + ret + +END (__wcslen) + +weak_alias(__wcslen, wcslen)