From: H.J. Lu
Date: Fri, 24 Jun 2011 18:15:32 +0000 (-0400)
Subject: Optimized st{r,p}{,n}cpy for SSE2/SSSE3 on x86-32
X-Git-Tag: glibc-2.15~513
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=0b1cbaaef5ccc21baf2c35d4698fb28e82eab385;p=platform%2Fupstream%2Fglibc.git

Optimized st{r,p}{,n}cpy for SSE2/SSSE3 on x86-32
---

diff --git a/ChangeLog b/ChangeLog
index b4d6496..097ad20 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,28 @@
+2011-06-22  H.J. Lu
+
+	* sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add
+	strncpy-c strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3
+	strcpy-sse2 strncpy-sse2 stpcpy-sse2 stpncpy-sse2.
+	* sysdeps/i386/i686/multiarch/stpcpy-sse2.S: New file.
+	* sysdeps/i386/i686/multiarch/stpcpy-ssse3.S: New file.
+	* sysdeps/i386/i686/multiarch/stpncpy-sse2.S: New file.
+	* sysdeps/i386/i686/multiarch/stpncpy-ssse3.S: New file.
+	* sysdeps/i386/i686/multiarch/stpncpy.S: New file.
+	* sysdeps/i386/i686/multiarch/strcpy-sse2.S: New file.
+	* sysdeps/i386/i686/multiarch/strcpy-ssse3.S: New file.
+	* sysdeps/i386/i686/multiarch/strcpy.S: New file.
+	* sysdeps/i386/i686/multiarch/strncpy-c.c: New file.
+	* sysdeps/i386/i686/multiarch/strncpy-sse2.S: New file.
+	* sysdeps/i386/i686/multiarch/strncpy-ssse3.S: New file.
+	* sysdeps/i386/i686/multiarch/strncpy.S: New file.
+	* sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
+	Enable unaligned load optimization for Intel Core i3, i5 and i7
+	processors.
+	* sysdeps/x86_64/multiarch/init-arch.h (bit_Fast_Unaligned_Load):
+	Define.
+	(index_Fast_Unaligned_Load): Define.
+	(HAS_FAST_UNALIGNED_LOAD): Define.
+
 2011-06-23  Marek Polacek

 	* nss/nss_db/db-open.c: Include <unistd.h> for read declaration.

diff --git a/NEWS b/NEWS
index 5a7ffc2..edb356d 100644
--- a/NEWS
+++ b/NEWS
@@ -1,4 +1,4 @@
-GNU C Library NEWS -- history of user-visible changes.  2011-6-22
+GNU C Library NEWS -- history of user-visible changes.  2011-6-24

 Copyright (C) 1992-2009, 2010, 2011 Free Software Foundation, Inc.
 See the end for copying conditions.
@@ -17,6 +17,9 @@ Version 2.15
 * Add nss_db support back to glibc.  No more dependency on Berkeley db
   and support for initgroups lookups.  Implemented by Ulrich Drepper.
+
+* Optimized strcpy, strncpy, stpcpy, stpncpy for SSE2 and SSSE3 on x86-32.
+  Contributed by HJ Lu.
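The bulk of the patch follows.  Note how the small wrapper sources listed
above work: each stpcpy-*/stpncpy-*/strncpy-* file merely defines
USE_AS_STPCPY and/or USE_AS_STRNCPY plus the STRCPY entry name and then
includes strcpy-sse2.S or strcpy-ssse3.S, so the two large implementations
provide all eight optimized entry points.  Which variant actually runs is
decided by the multiarch stubs (strcpy.S, stpcpy.S, strncpy.S, stpncpy.S)
from the CPU feature bits in init-arch.h; the new Fast_Unaligned_Load bit
(set for Intel Core i3/i5/i7 in init-arch.c, per the ChangeLog) is what
lets them prefer the unaligned-load SSE2 version where unaligned 16-byte
loads are cheap.  A rough C sketch of that dispatch idea (illustrative
only, not glibc's code: the variant functions are stand-ins that defer to
the C library, and GCC's __builtin_cpu_supports has no equivalent of
glibc's private Fast_Unaligned_Load bit, so only SSSE3 vs. SSE2 is shown):

#include <stdio.h>
#include <string.h>

/* Stand-in "variants" so the example links and runs.  */
static char *strcpy_sse2_variant (char *d, const char *s)  { return strcpy (d, s); }
static char *strcpy_ssse3_variant (char *d, const char *s) { return strcpy (d, s); }

/* Pick one implementation from CPU features, as the multiarch stubs do.  */
static char *(*pick_strcpy (void)) (char *, const char *)
{
  __builtin_cpu_init ();	/* reads CPUID, much like __init_cpu_features */
  if (__builtin_cpu_supports ("ssse3"))
    return strcpy_ssse3_variant;
  return strcpy_sse2_variant;	/* the real stubs also keep a plain ia32 fallback */
}

int main (void)
{
  char buf[16];
  char *(*cpy) (char *, const char *) = pick_strcpy ();
  cpy (buf, "hello");
  puts (buf);
  return 0;
}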
Version 2.14 diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile index 32286d8..4bae699 100644 --- a/sysdeps/i386/i686/multiarch/Makefile +++ b/sysdeps/i386/i686/multiarch/Makefile @@ -10,7 +10,9 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \ memset-sse2-rep bzero-sse2-rep strcmp-ssse3 \ strcmp-sse4 strncmp-c strncmp-ssse3 strncmp-sse4 \ memcmp-ssse3 memcmp-sse4 strcasestr-nonascii varshift \ - strlen-sse2 strlen-sse2-bsf + strlen-sse2 strlen-sse2-bsf strncpy-c strcpy-ssse3 \ + strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 strcpy-sse2 \ + strncpy-sse2 stpcpy-sse2 stpncpy-sse2 ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c CFLAGS-varshift.c += -msse4 diff --git a/sysdeps/i386/i686/multiarch/stpcpy-sse2.S b/sysdeps/i386/i686/multiarch/stpcpy-sse2.S new file mode 100644 index 0000000..46ca1b3 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/stpcpy-sse2.S @@ -0,0 +1,3 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy_sse2 +#include "strcpy-sse2.S" diff --git a/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S b/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S new file mode 100644 index 0000000..d971c2d --- /dev/null +++ b/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S @@ -0,0 +1,3 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy_ssse3 +#include "strcpy-ssse3.S" diff --git a/sysdeps/i386/i686/multiarch/stpcpy.S b/sysdeps/i386/i686/multiarch/stpcpy.S new file mode 100644 index 0000000..b63d308 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/stpcpy.S @@ -0,0 +1,7 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy +#include "strcpy.S" + +weak_alias (__stpcpy, stpcpy) +libc_hidden_def (__stpcpy) +libc_hidden_builtin_def (stpcpy) diff --git a/sysdeps/i386/i686/multiarch/stpncpy-sse2.S b/sysdeps/i386/i686/multiarch/stpncpy-sse2.S new file mode 100644 index 0000000..37a703c --- /dev/null +++ b/sysdeps/i386/i686/multiarch/stpncpy-sse2.S @@ -0,0 +1,4 @@ +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#define STRCPY __stpncpy_sse2 +#include "strcpy-sse2.S" diff --git a/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S b/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S new file mode 100644 index 0000000..14ed16f --- /dev/null +++ b/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#define STRCPY __stpncpy_ssse3 +#include "strcpy-ssse3.S" diff --git a/sysdeps/i386/i686/multiarch/stpncpy.S b/sysdeps/i386/i686/multiarch/stpncpy.S new file mode 100644 index 0000000..ff89a89 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/stpncpy.S @@ -0,0 +1,6 @@ +#define STRCPY __stpncpy +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#include "strcpy.S" + +weak_alias (__stpncpy, stpncpy) diff --git a/sysdeps/i386/i686/multiarch/strcpy-sse2.S b/sysdeps/i386/i686/multiarch/strcpy-sse2.S new file mode 100644 index 0000000..fad1ae2 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strcpy-sse2.S @@ -0,0 +1,2251 @@ +/* strcpy with SSE2 and unaligned load + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + + +#ifndef NOT_IN_libc + +# include + + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# ifndef STRCPY +# define STRCPY __strcpy_sse2 +# endif + +# define STR1 PARMS +# define STR2 STR1+4 +# define LEN STR2+4 + +# ifdef USE_AS_STRNCPY +# define PARMS 16 +# define ENTRANCE PUSH(%ebx); PUSH(%esi); PUSH(%edi) +# define RETURN POP(%edi); POP(%esi); POP(%ebx); ret; \ + CFI_PUSH(%ebx); CFI_PUSH(%edi); CFI_PUSH(%edi); + +# ifdef SHARED +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into ECX and branch to it. TABLE is a + jump table with relative offsets. + INDEX is a register contains the index into the jump table. + SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + /* We first load PC into ECX. */ \ + call __i686.get_pc_thunk.cx; \ + /* Get the address of the jump table. */ \ + addl $(TABLE - .), %ecx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ecx,INDEX,SCALE), %ecx; \ + /* We loaded the jump table and adjuested ECX. Go. */ \ + jmp *%ecx +# else +# define JMPTBL(I, B) I + +/* Branch to an entry in a jump table. TABLE is a jump table with + absolute offsets. INDEX is a register contains the index into the + jump table. SCALE is the scale of INDEX. 
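   For example, BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) jumps to the
   4-byte entry selected by %edx.  In the SHARED variant above the entries
   are stored as offsets relative to the table itself (JMPTBL(I, B) = I - B),
   so the macro first obtains the table's run-time address through the PC
   thunk, adds the loaded offset and only then jumps, which keeps the jump
   tables position-independent; here, with absolute entries, a single
   indirect jmp through the table suffices.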
*/ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) +# endif + +.text +ENTRY (STRCPY) + ENTRANCE + mov STR1(%esp), %edi + mov STR2(%esp), %esi + movl LEN(%esp), %ebx + test %ebx, %ebx + jz L(ExitZero) + + mov %esi, %ecx +# ifndef USE_AS_STPCPY + mov %edi, %eax /* save result */ +# endif + and $15, %ecx + jz L(SourceStringAlignmentZero) + + and $-16, %esi + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + pcmpeqb (%esi), %xmm1 + add %ecx, %ebx + pmovmskb %xmm1, %edx + shr %cl, %edx +# ifdef USE_AS_STPCPY + cmp $16, %ebx + jbe L(CopyFrom1To16BytesTailCase2OrCase3) +# else + cmp $17, %ebx + jbe L(CopyFrom1To16BytesTailCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16BytesTail) + + pcmpeqb 16(%esi), %xmm0 + pmovmskb %xmm0, %edx +# ifdef USE_AS_STPCPY + cmp $32, %ebx + jbe L(CopyFrom1To32BytesCase2OrCase3) +# else + cmp $33, %ebx + jbe L(CopyFrom1To32BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To32Bytes) + + movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */ + movdqu %xmm1, (%edi) + + sub %ecx, %edi + +/* If source adress alignment != destination adress alignment */ + .p2align 4 +L(Unalign16Both): + mov $16, %ecx + movdqa (%esi, %ecx), %xmm1 + movaps 16(%esi, %ecx), %xmm2 + movdqu %xmm1, (%edi, %ecx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $48, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm2) + + movaps 16(%esi, %ecx), %xmm3 + movdqu %xmm2, (%edi, %ecx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm3) + + movaps 16(%esi, %ecx), %xmm4 + movdqu %xmm3, (%edi, %ecx) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm4) + + movaps 16(%esi, %ecx), %xmm1 + movdqu %xmm4, (%edi, %ecx) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm1) + + movaps 16(%esi, %ecx), %xmm2 + movdqu %xmm1, (%edi, %ecx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm2) + + movaps 16(%esi, %ecx), %xmm3 + movdqu %xmm2, (%edi, %ecx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm3) + + movdqu %xmm3, (%edi, %ecx) + mov %esi, %edx + lea 16(%esi, %ecx), %esi + and $-0x40, %esi + sub %esi, %edx + sub %edx, %edi + lea 128(%ebx, %edx), %ebx + +L(Unaligned64Loop): + movaps (%esi), %xmm2 + movaps %xmm2, %xmm4 + movaps 16(%esi), %xmm5 + movaps 32(%esi), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%esi), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx + sub $64, %ebx + jbe L(UnalignedLeaveCase2OrCase3) + test %edx, %edx + jnz L(Unaligned64Leave) +L(Unaligned64Loop_start): + add $64, %edi + add $64, %esi + movdqu %xmm4, -64(%edi) + movaps (%esi), %xmm2 + movdqa %xmm2, %xmm4 + movdqu %xmm5, -48(%edi) + movaps 16(%esi), %xmm5 + pminub %xmm5, %xmm2 + movaps 32(%esi), %xmm3 + movdqu %xmm6, -32(%edi) + movaps %xmm3, %xmm6 + movdqu %xmm7, -16(%edi) + movaps 48(%esi), %xmm7 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb 
%xmm0, %xmm3 + pmovmskb %xmm3, %edx + sub $64, %ebx + jbe L(UnalignedLeaveCase2OrCase3) + test %edx, %edx + jz L(Unaligned64Loop_start) +L(Unaligned64Leave): + pxor %xmm1, %xmm1 + + pcmpeqb %xmm4, %xmm0 + pcmpeqb %xmm5, %xmm1 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %ecx + test %edx, %edx + jnz L(CopyFrom1To16BytesUnaligned_0) + test %ecx, %ecx + jnz L(CopyFrom1To16BytesUnaligned_16) + + pcmpeqb %xmm6, %xmm0 + pcmpeqb %xmm7, %xmm1 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %ecx + test %edx, %edx + jnz L(CopyFrom1To16BytesUnaligned_32) + + bsf %ecx, %edx + movdqu %xmm4, (%edi) + movdqu %xmm5, 16(%edi) + movdqu %xmm6, 32(%edi) +# ifdef USE_AS_STPCPY + lea 48(%edi, %edx), %eax +# endif + movdqu %xmm7, 48(%edi) + add $15, %ebx + sub %edx, %ebx + lea 49(%edi, %edx), %edi + jmp L(StrncpyFillTailWithZero) + +/* If source adress alignment == destination adress alignment */ + +L(SourceStringAlignmentZero): + pxor %xmm0, %xmm0 + movdqa (%esi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx +# ifdef USE_AS_STPCPY + cmp $16, %ebx + jbe L(CopyFrom1To16BytesTail1Case2OrCase3) +# else + cmp $17, %ebx + jbe L(CopyFrom1To16BytesTail1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16BytesTail1) + + pcmpeqb 16(%esi), %xmm0 + movdqu %xmm1, (%edi) + pmovmskb %xmm0, %edx +# ifdef USE_AS_STPCPY + cmp $32, %ebx + jbe L(CopyFrom1To32Bytes1Case2OrCase3) +# else + cmp $33, %ebx + jbe L(CopyFrom1To32Bytes1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To32Bytes1) + + jmp L(Unalign16Both) + +/*-----------------End of main part---------------------------*/ + +/* Case1 */ + .p2align 4 +L(CopyFrom1To16BytesTail): + sub %ecx, %ebx + add %ecx, %esi + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1): + add $16, %esi + add $16, %edi + sub $16, %ebx +L(CopyFrom1To16BytesTail1): + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes): + sub %ecx, %ebx + bsf %edx, %edx + add %ecx, %esi + add $16, %edx + sub %ecx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_0): + bsf %edx, %edx +# ifdef USE_AS_STPCPY + lea (%edi, %edx), %eax +# endif + movdqu %xmm4, (%edi) + add $63, %ebx + sub %edx, %ebx + lea 1(%edi, %edx), %edi + jmp L(StrncpyFillTailWithZero) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_16): + bsf %ecx, %edx + movdqu %xmm4, (%edi) +# ifdef USE_AS_STPCPY + lea 16(%edi, %edx), %eax +# endif + movdqu %xmm5, 16(%edi) + add $47, %ebx + sub %edx, %ebx + lea 17(%edi, %edx), %edi + jmp L(StrncpyFillTailWithZero) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_32): + bsf %edx, %edx + movdqu %xmm4, (%edi) + movdqu %xmm5, 16(%edi) +# ifdef USE_AS_STPCPY + lea 32(%edi, %edx), %eax +# endif + movdqu %xmm6, 32(%edi) + add $31, %ebx + sub %edx, %ebx + lea 33(%edi, %edx), %edi + jmp L(StrncpyFillTailWithZero) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm6): + movdqu %xmm6, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm5): + movdqu %xmm5, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm4): + movdqu %xmm4, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm3): + movdqu %xmm3, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm1): + movdqu %xmm1, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesExit): + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), 
%edx, 4) + +/* Case2 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %ebx + add %ecx, %edi + add %ecx, %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2): + sub %ecx, %ebx + add %ecx, %esi + bsf %edx, %edx + add $16, %edx + sub %ecx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + +L(CopyFrom1To16BytesTailCase2): + sub %ecx, %ebx + add %ecx, %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + +L(CopyFrom1To16BytesTail1Case2): + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + +/* Case2 or Case3, Case3 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesCase2) +L(CopyFrom1To16BytesCase3): + add $16, %ebx + add %ecx, %edi + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To32BytesCase2) + sub %ecx, %ebx + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To16BytesTailCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesTailCase2) + sub %ecx, %ebx + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1Case2OrCase3): + add $16, %edi + add $16, %esi + sub $16, %ebx +L(CopyFrom1To16BytesTail1Case2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesTail1Case2) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(Exit0): +# ifdef USE_AS_STPCPY + mov %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit1): + movb %dh, (%edi) +# ifdef USE_AS_STPCPY + lea (%edi), %eax +# endif + sub $1, %ebx + lea 1(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit2): + movw (%esi), %dx + movw %dx, (%edi) +# ifdef USE_AS_STPCPY + lea 1(%edi), %eax +# endif + sub $2, %ebx + lea 2(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit3): + movw (%esi), %cx + movw %cx, (%edi) + movb %dh, 2(%edi) +# ifdef USE_AS_STPCPY + lea 2(%edi), %eax +# endif + sub $3, %ebx + lea 3(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit4): + movl (%esi), %edx + movl %edx, (%edi) +# ifdef USE_AS_STPCPY + lea 3(%edi), %eax +# endif + sub $4, %ebx + lea 4(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit5): + movl (%esi), %ecx + movb %dh, 4(%edi) + movl %ecx, (%edi) +# ifdef USE_AS_STPCPY + lea 4(%edi), %eax +# endif + sub $5, %ebx + lea 5(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit6): + movl (%esi), %ecx + movw 4(%esi), %dx + movl %ecx, (%edi) + movw %dx, 4(%edi) +# ifdef USE_AS_STPCPY + lea 5(%edi), %eax +# endif + sub $6, %ebx + lea 6(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit7): + movl (%esi), %ecx + movl 3(%esi), %edx + movl %ecx, (%edi) + movl %edx, 3(%edi) +# ifdef USE_AS_STPCPY + lea 6(%edi), %eax +# endif + sub $7, %ebx + lea 7(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit8): + movlpd (%esi), %xmm0 + movlpd %xmm0, (%edi) +# ifdef USE_AS_STPCPY + lea 7(%edi), %eax +# endif + sub $8, %ebx + lea 8(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit9): + movlpd (%esi), %xmm0 + movb %dh, 8(%edi) + movlpd 
%xmm0, (%edi) +# ifdef USE_AS_STPCPY + lea 8(%edi), %eax +# endif + sub $9, %ebx + lea 9(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit10): + movlpd (%esi), %xmm0 + movw 8(%esi), %dx + movlpd %xmm0, (%edi) + movw %dx, 8(%edi) +# ifdef USE_AS_STPCPY + lea 9(%edi), %eax +# endif + sub $10, %ebx + lea 10(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit11): + movlpd (%esi), %xmm0 + movl 7(%esi), %edx + movlpd %xmm0, (%edi) + movl %edx, 7(%edi) +# ifdef USE_AS_STPCPY + lea 10(%edi), %eax +# endif + sub $11, %ebx + lea 11(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit12): + movlpd (%esi), %xmm0 + movl 8(%esi), %edx + movlpd %xmm0, (%edi) + movl %edx, 8(%edi) +# ifdef USE_AS_STPCPY + lea 11(%edi), %eax +# endif + sub $12, %ebx + lea 12(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit13): + movlpd (%esi), %xmm0 + movlpd 5(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 5(%edi) +# ifdef USE_AS_STPCPY + lea 12(%edi), %eax +# endif + sub $13, %ebx + lea 13(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit14): + movlpd (%esi), %xmm0 + movlpd 6(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 6(%edi) +# ifdef USE_AS_STPCPY + lea 13(%edi), %eax +# endif + sub $14, %ebx + lea 14(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit15): + movlpd (%esi), %xmm0 + movlpd 7(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 7(%edi) +# ifdef USE_AS_STPCPY + lea 14(%edi), %eax +# endif + sub $15, %ebx + lea 15(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit16): + movdqu (%esi), %xmm0 + movdqu %xmm0, (%edi) +# ifdef USE_AS_STPCPY + lea 15(%edi), %eax +# endif + sub $16, %ebx + lea 16(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit17): + movdqu (%esi), %xmm0 + movdqu %xmm0, (%edi) + movb %dh, 16(%edi) +# ifdef USE_AS_STPCPY + lea 16(%edi), %eax +# endif + sub $17, %ebx + lea 17(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit18): + movdqu (%esi), %xmm0 + movw 16(%esi), %cx + movdqu %xmm0, (%edi) + movw %cx, 16(%edi) +# ifdef USE_AS_STPCPY + lea 17(%edi), %eax +# endif + sub $18, %ebx + lea 18(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit19): + movdqu (%esi), %xmm0 + movl 15(%esi), %ecx + movdqu %xmm0, (%edi) + movl %ecx, 15(%edi) +# ifdef USE_AS_STPCPY + lea 18(%edi), %eax +# endif + sub $19, %ebx + lea 19(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit20): + movdqu (%esi), %xmm0 + movl 16(%esi), %ecx + movdqu %xmm0, (%edi) + movl %ecx, 16(%edi) +# ifdef USE_AS_STPCPY + lea 19(%edi), %eax +# endif + sub $20, %ebx + lea 20(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit21): + movdqu (%esi), %xmm0 + movl 16(%esi), %ecx + movdqu %xmm0, (%edi) + movl %ecx, 16(%edi) + movb %dh, 20(%edi) +# ifdef USE_AS_STPCPY + lea 20(%edi), %eax +# endif + sub $21, %ebx + lea 21(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit22): + movdqu (%esi), %xmm0 + movlpd 14(%esi), %xmm3 + movdqu %xmm0, (%edi) + movlpd %xmm3, 14(%edi) +# ifdef USE_AS_STPCPY + lea 21(%edi), %eax +# endif + sub $22, %ebx + lea 22(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit23): + movdqu (%esi), %xmm0 + movlpd 15(%esi), %xmm3 + movdqu %xmm0, (%edi) + movlpd %xmm3, 15(%edi) +# ifdef USE_AS_STPCPY + lea 22(%edi), %eax +# endif + sub $23, %ebx + lea 23(%edi), %edi 
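/* These Exit paths avoid byte-by-byte copying: L(Exit23) above, for example,
   stores 16 bytes at offset 0 (movdqu) and 8 bytes at offset 15 (movlpd),
   deliberately rewriting byte 15, which is cheaper than copying the odd
   tail one byte at a time.  The other Exit/StrncpyExit stubs use the same
   wide, possibly overlapping 2/4/8/16-byte stores for their fixed lengths.  */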
+ jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit24): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) +# ifdef USE_AS_STPCPY + lea 23(%edi), %eax +# endif + sub $24, %ebx + lea 24(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit25): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movb %dh, 24(%edi) +# ifdef USE_AS_STPCPY + lea 24(%edi), %eax +# endif + sub $25, %ebx + lea 25(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit26): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movw 24(%esi), %cx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movw %cx, 24(%edi) +# ifdef USE_AS_STPCPY + lea 25(%edi), %eax +# endif + sub $26, %ebx + lea 26(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit27): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 23(%esi), %ecx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movl %ecx, 23(%edi) +# ifdef USE_AS_STPCPY + lea 26(%edi), %eax +# endif + sub $27, %ebx + lea 27(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit28): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 24(%esi), %ecx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movl %ecx, 24(%edi) +# ifdef USE_AS_STPCPY + lea 27(%edi), %eax +# endif + sub $28, %ebx + lea 28(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit29): + movdqu (%esi), %xmm0 + movdqu 13(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 13(%edi) +# ifdef USE_AS_STPCPY + lea 28(%edi), %eax +# endif + sub $29, %ebx + lea 29(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit30): + movdqu (%esi), %xmm0 + movdqu 14(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 14(%edi) +# ifdef USE_AS_STPCPY + lea 29(%edi), %eax +# endif + sub $30, %ebx + lea 30(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + + .p2align 4 +L(Exit31): + movdqu (%esi), %xmm0 + movdqu 15(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 15(%edi) +# ifdef USE_AS_STPCPY + lea 30(%edi), %eax +# endif + sub $31, %ebx + lea 31(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit32): + movdqu (%esi), %xmm0 + movdqu 16(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 16(%edi) +# ifdef USE_AS_STPCPY + lea 31(%edi), %eax +# endif + sub $32, %ebx + lea 32(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(StrncpyExit1): + movb (%esi), %dl + movb %dl, (%edi) +# ifdef USE_AS_STPCPY + lea 1(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit2): + movw (%esi), %dx + movw %dx, (%edi) +# ifdef USE_AS_STPCPY + lea 2(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit3): + movw (%esi), %cx + movb 2(%esi), %dl + movw %cx, (%edi) + movb %dl, 2(%edi) +# ifdef USE_AS_STPCPY + lea 3(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit4): + movl (%esi), %edx + movl %edx, (%edi) +# ifdef USE_AS_STPCPY + lea 4(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit5): + movl (%esi), %ecx + movb 4(%esi), %dl + movl %ecx, (%edi) + movb %dl, 4(%edi) +# ifdef USE_AS_STPCPY + lea 5(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit6): + movl (%esi), %ecx + movw 4(%esi), %dx + movl %ecx, (%edi) + movw %dx, 4(%edi) +# ifdef USE_AS_STPCPY + lea 6(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit7): + movl (%esi), %ecx + movl 3(%esi), %edx + movl %ecx, (%edi) + movl %edx, 3(%edi) +# ifdef USE_AS_STPCPY + lea 
7(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit8): + movlpd (%esi), %xmm0 + movlpd %xmm0, (%edi) +# ifdef USE_AS_STPCPY + lea 8(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit9): + movlpd (%esi), %xmm0 + movb 8(%esi), %dl + movlpd %xmm0, (%edi) + movb %dl, 8(%edi) +# ifdef USE_AS_STPCPY + lea 9(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit10): + movlpd (%esi), %xmm0 + movw 8(%esi), %dx + movlpd %xmm0, (%edi) + movw %dx, 8(%edi) +# ifdef USE_AS_STPCPY + lea 10(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit11): + movlpd (%esi), %xmm0 + movl 7(%esi), %edx + movlpd %xmm0, (%edi) + movl %edx, 7(%edi) +# ifdef USE_AS_STPCPY + lea 11(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit12): + movlpd (%esi), %xmm0 + movl 8(%esi), %edx + movlpd %xmm0, (%edi) + movl %edx, 8(%edi) +# ifdef USE_AS_STPCPY + lea 12(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit13): + movlpd (%esi), %xmm0 + movlpd 5(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 5(%edi) +# ifdef USE_AS_STPCPY + lea 13(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit14): + movlpd (%esi), %xmm0 + movlpd 6(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 6(%edi) +# ifdef USE_AS_STPCPY + lea 14(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit15): + movlpd (%esi), %xmm0 + movlpd 7(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 7(%edi) +# ifdef USE_AS_STPCPY + lea 15(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit16): + movdqu (%esi), %xmm0 + movdqu %xmm0, (%edi) +# ifdef USE_AS_STPCPY + lea 16(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit17): + movdqu (%esi), %xmm0 + movb 16(%esi), %cl + movdqu %xmm0, (%edi) + movb %cl, 16(%edi) +# ifdef USE_AS_STPCPY + lea 17(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit18): + movdqu (%esi), %xmm0 + movw 16(%esi), %cx + movdqu %xmm0, (%edi) + movw %cx, 16(%edi) +# ifdef USE_AS_STPCPY + lea 18(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit19): + movdqu (%esi), %xmm0 + movl 15(%esi), %ecx + movdqu %xmm0, (%edi) + movl %ecx, 15(%edi) +# ifdef USE_AS_STPCPY + lea 19(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit20): + movdqu (%esi), %xmm0 + movl 16(%esi), %ecx + movdqu %xmm0, (%edi) + movl %ecx, 16(%edi) +# ifdef USE_AS_STPCPY + lea 20(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit21): + movdqu (%esi), %xmm0 + movl 16(%esi), %ecx + movb 20(%esi), %dl + movdqu %xmm0, (%edi) + movl %ecx, 16(%edi) + movb %dl, 20(%edi) +# ifdef USE_AS_STPCPY + lea 21(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit22): + movdqu (%esi), %xmm0 + movlpd 14(%esi), %xmm3 + movdqu %xmm0, (%edi) + movlpd %xmm3, 14(%edi) +# ifdef USE_AS_STPCPY + lea 22(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit23): + movdqu (%esi), %xmm0 + movlpd 15(%esi), %xmm3 + movdqu %xmm0, (%edi) + movlpd %xmm3, 15(%edi) +# ifdef USE_AS_STPCPY + lea 23(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit24): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) +# ifdef USE_AS_STPCPY + lea 24(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit25): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movb 24(%esi), %cl + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movb %cl, 24(%edi) +# ifdef USE_AS_STPCPY + lea 25(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit26): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movw 24(%esi), %cx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movw %cx, 
24(%edi) +# ifdef USE_AS_STPCPY + lea 26(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit27): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 23(%esi), %ecx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movl %ecx, 23(%edi) +# ifdef USE_AS_STPCPY + lea 27(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit28): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 24(%esi), %ecx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movl %ecx, 24(%edi) +# ifdef USE_AS_STPCPY + lea 28(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit29): + movdqu (%esi), %xmm0 + movdqu 13(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 13(%edi) +# ifdef USE_AS_STPCPY + lea 29(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit30): + movdqu (%esi), %xmm0 + movdqu 14(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 14(%edi) +# ifdef USE_AS_STPCPY + lea 30(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit31): + movdqu (%esi), %xmm0 + movdqu 15(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 15(%edi) +# ifdef USE_AS_STPCPY + lea 31(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit32): + movdqu (%esi), %xmm0 + movdqu 16(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 16(%edi) +# ifdef USE_AS_STPCPY + lea 32(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit33): + movdqu (%esi), %xmm0 + movdqu 16(%esi), %xmm2 + movb 32(%esi), %cl + movdqu %xmm0, (%edi) + movdqu %xmm2, 16(%edi) + movb %cl, 32(%edi) + RETURN + + .p2align 4 +L(Fill0): + RETURN + + .p2align 4 +L(Fill1): + movb %dl, (%edi) + RETURN + + .p2align 4 +L(Fill2): + movw %dx, (%edi) + RETURN + + .p2align 4 +L(Fill3): + movl %edx, -1(%edi) + RETURN + + .p2align 4 +L(Fill4): + movl %edx, (%edi) + RETURN + + .p2align 4 +L(Fill5): + movl %edx, (%edi) + movb %dl, 4(%edi) + RETURN + + .p2align 4 +L(Fill6): + movl %edx, (%edi) + movw %dx, 4(%edi) + RETURN + + .p2align 4 +L(Fill7): + movlpd %xmm0, -1(%edi) + RETURN + + .p2align 4 +L(Fill8): + movlpd %xmm0, (%edi) + RETURN + + .p2align 4 +L(Fill9): + movlpd %xmm0, (%edi) + movb %dl, 8(%edi) + RETURN + + .p2align 4 +L(Fill10): + movlpd %xmm0, (%edi) + movw %dx, 8(%edi) + RETURN + + .p2align 4 +L(Fill11): + movlpd %xmm0, (%edi) + movl %edx, 7(%edi) + RETURN + + .p2align 4 +L(Fill12): + movlpd %xmm0, (%edi) + movl %edx, 8(%edi) + RETURN + + .p2align 4 +L(Fill13): + movlpd %xmm0, (%edi) + movlpd %xmm0, 5(%edi) + RETURN + + .p2align 4 +L(Fill14): + movlpd %xmm0, (%edi) + movlpd %xmm0, 6(%edi) + RETURN + + .p2align 4 +L(Fill15): + movdqu %xmm0, -1(%edi) + RETURN + + .p2align 4 +L(Fill16): + movdqu %xmm0, (%edi) + RETURN + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm2): + movdqu %xmm2, (%edi, %ecx) + + .p2align 4 +L(CopyFrom1To16BytesXmmExit): + bsf %edx, %edx + add $15, %ebx + add %ecx, %edi +# ifdef USE_AS_STPCPY + lea (%edi, %edx), %eax +# endif + sub %edx, %ebx + lea 1(%edi, %edx), %edi + + .p2align 4 +L(StrncpyFillTailWithZero): + pxor %xmm0, %xmm0 + xor %edx, %edx + sub $16, %ebx + jbe L(StrncpyFillExit) + + movdqu %xmm0, (%edi) + add $16, %edi + + mov %edi, %esi + and $0xf, %esi + sub %esi, %edi + add %esi, %ebx + sub $64, %ebx + jb L(StrncpyFillLess64) + +L(StrncpyFillLoopMovdqa): + movdqa %xmm0, (%edi) + movdqa %xmm0, 16(%edi) + movdqa %xmm0, 32(%edi) + movdqa %xmm0, 48(%edi) + add $64, %edi + sub $64, %ebx + jae L(StrncpyFillLoopMovdqa) + +L(StrncpyFillLess64): + add $32, %ebx + jl L(StrncpyFillLess32) + movdqa %xmm0, (%edi) + movdqa %xmm0, 16(%edi) + add $32, %edi + sub $16, %ebx + jl L(StrncpyFillExit) + movdqa %xmm0, (%edi) 
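/* Here %ebx (0..15) is the number of padding zero bytes still missing after
   the 16 just stored; the jump through L(FillTable) below writes exactly
   that many more zero bytes and returns.  */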
+ add $16, %edi + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4) + +L(StrncpyFillLess32): + add $16, %ebx + jl L(StrncpyFillExit) + movdqa %xmm0, (%edi) + add $16, %edi + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4) + +L(StrncpyFillExit): + add $16, %ebx + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4) + + .p2align 4 +L(UnalignedLeaveCase2OrCase3): + test %edx, %edx + jnz L(Unaligned64LeaveCase2) +L(Unaligned64LeaveCase3): + lea 64(%ebx), %ecx + and $-16, %ecx + add $48, %ebx + jl L(CopyFrom1To16BytesCase3) + movdqu %xmm4, (%edi) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm5, 16(%edi) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm6, 32(%edi) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm7, 48(%edi) +# ifdef USE_AS_STPCPY + lea 64(%edi), %eax +# endif + RETURN + + .p2align 4 +L(Unaligned64LeaveCase2): + xor %ecx, %ecx + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %edx + add $48, %ebx + jle L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm4) + + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %edx + movdqu %xmm4, (%edi) + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm5) + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %edx + movdqu %xmm5, 16(%edi) + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm6) + + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %edx + movdqu %xmm6, 32(%edi) + lea 16(%edi, %ecx), %edi + lea 16(%esi, %ecx), %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(ExitZero): + movl %edi, %eax + RETURN + +END (STRCPY) + + .p2align 4 + .section .rodata +L(ExitTable): + .int JMPTBL(L(Exit1), L(ExitTable)) + .int JMPTBL(L(Exit2), L(ExitTable)) + .int JMPTBL(L(Exit3), L(ExitTable)) + .int JMPTBL(L(Exit4), L(ExitTable)) + .int JMPTBL(L(Exit5), L(ExitTable)) + .int JMPTBL(L(Exit6), L(ExitTable)) + .int JMPTBL(L(Exit7), L(ExitTable)) + .int JMPTBL(L(Exit8), L(ExitTable)) + .int JMPTBL(L(Exit9), L(ExitTable)) + .int JMPTBL(L(Exit10), L(ExitTable)) + .int JMPTBL(L(Exit11), L(ExitTable)) + .int JMPTBL(L(Exit12), L(ExitTable)) + .int JMPTBL(L(Exit13), L(ExitTable)) + .int JMPTBL(L(Exit14), L(ExitTable)) + .int JMPTBL(L(Exit15), L(ExitTable)) + .int JMPTBL(L(Exit16), L(ExitTable)) + .int JMPTBL(L(Exit17), L(ExitTable)) + .int JMPTBL(L(Exit18), L(ExitTable)) + .int JMPTBL(L(Exit19), L(ExitTable)) + .int JMPTBL(L(Exit20), L(ExitTable)) + .int JMPTBL(L(Exit21), L(ExitTable)) + .int JMPTBL(L(Exit22), L(ExitTable)) + .int JMPTBL(L(Exit23), L(ExitTable)) + .int JMPTBL(L(Exit24), L(ExitTable)) + .int JMPTBL(L(Exit25), L(ExitTable)) + .int JMPTBL(L(Exit26), L(ExitTable)) + .int JMPTBL(L(Exit27), L(ExitTable)) + .int JMPTBL(L(Exit28), L(ExitTable)) + .int JMPTBL(L(Exit29), L(ExitTable)) + .int JMPTBL(L(Exit30), L(ExitTable)) + .int JMPTBL(L(Exit31), L(ExitTable)) + .int JMPTBL(L(Exit32), L(ExitTable)) + +L(ExitStrncpyTable): + .int JMPTBL(L(Exit0), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable)) + .int 
JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable)) + + .p2align 4 +L(FillTable): + .int JMPTBL(L(Fill0), L(FillTable)) + .int JMPTBL(L(Fill1), L(FillTable)) + .int JMPTBL(L(Fill2), L(FillTable)) + .int JMPTBL(L(Fill3), L(FillTable)) + .int JMPTBL(L(Fill4), L(FillTable)) + .int JMPTBL(L(Fill5), L(FillTable)) + .int JMPTBL(L(Fill6), L(FillTable)) + .int JMPTBL(L(Fill7), L(FillTable)) + .int JMPTBL(L(Fill8), L(FillTable)) + .int JMPTBL(L(Fill9), L(FillTable)) + .int JMPTBL(L(Fill10), L(FillTable)) + .int JMPTBL(L(Fill11), L(FillTable)) + .int JMPTBL(L(Fill12), L(FillTable)) + .int JMPTBL(L(Fill13), L(FillTable)) + .int JMPTBL(L(Fill14), L(FillTable)) + .int JMPTBL(L(Fill15), L(FillTable)) + .int JMPTBL(L(Fill16), L(FillTable)) +# else +# define PARMS 4 +# define ENTRANCE +# define RETURN POP (%edi); ret; CFI_PUSH (%edi) +# define RETURN1 ret + + .text +ENTRY (STRCPY) + ENTRANCE + mov STR1(%esp), %edx + mov STR2(%esp), %ecx + + cmpb $0, (%ecx) + jz L(ExitTail1) + cmpb $0, 1(%ecx) + jz L(ExitTail2) + cmpb $0, 2(%ecx) + jz L(ExitTail3) + cmpb $0, 3(%ecx) + jz L(ExitTail4) + cmpb $0, 4(%ecx) + jz L(ExitTail5) + cmpb $0, 5(%ecx) + jz L(ExitTail6) + cmpb $0, 6(%ecx) + jz L(ExitTail7) + cmpb $0, 7(%ecx) + jz L(ExitTail8) + cmpb $0, 8(%ecx) + jz L(ExitTail9) + cmpb $0, 9(%ecx) + jz L(ExitTail10) + cmpb $0, 10(%ecx) + jz L(ExitTail11) + cmpb $0, 11(%ecx) + jz L(ExitTail12) + cmpb $0, 12(%ecx) + jz L(ExitTail13) + cmpb $0, 13(%ecx) + jz L(ExitTail14) + cmpb $0, 14(%ecx) + jz L(ExitTail15) + cmpb $0, 15(%ecx) + jz L(ExitTail16) + + PUSH (%edi) + PUSH (%ebx) + + mov %edx, %edi + lea 16(%ecx), %ebx + and $-16, %ebx + pxor %xmm0, %xmm0 + movdqu (%ecx), %xmm1 + movdqu %xmm1, (%edx) + pcmpeqb (%ebx), %xmm0 + pmovmskb %xmm0, %eax + sub %ecx, %ebx + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + mov %ecx, %eax + lea 16(%ecx), %ecx + and $-16, %ecx + sub %ecx, %eax + sub %eax, %edx + xor %ebx, %ebx + + .p2align 4 + movdqa (%ecx), %xmm1 + movaps 16(%ecx), %xmm2 + movdqu %xmm1, (%edx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + add $16, %ebx + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %ebx), %xmm3 + movdqu %xmm2, (%edx, %ebx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %eax + add $16, %ebx + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 
16(%ecx, %ebx), %xmm4 + movdqu %xmm3, (%edx, %ebx) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %eax + add $16, %ebx + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %ebx), %xmm1 + movdqu %xmm4, (%edx, %ebx) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + add $16, %ebx + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %ebx), %xmm2 + movdqu %xmm1, (%edx, %ebx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + add $16, %ebx + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %ebx), %xmm3 + movdqu %xmm2, (%edx, %ebx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %eax + add $16, %ebx + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movdqu %xmm3, (%edx, %ebx) + mov %ecx, %eax + lea 16(%ecx, %ebx), %ecx + and $-0x40, %ecx + sub %ecx, %eax + sub %eax, %edx + +L(Aligned64Loop): + movaps (%ecx), %xmm2 + movaps %xmm2, %xmm4 + movaps 16(%ecx), %xmm5 + movaps 32(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%ecx), %xmm7 + pminub %xmm5, %xmm2 + add $64, %ecx + pminub %xmm7, %xmm3 + add $64, %edx + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(Aligned64Leave) +L(Aligned64Loop_start): + movdqu %xmm4, -64(%edx) + movaps (%ecx), %xmm2 + movdqa %xmm2, %xmm4 + movdqu %xmm5, -48(%edx) + movaps 16(%ecx), %xmm5 + pminub %xmm5, %xmm2 + movaps 32(%ecx), %xmm3 + movdqu %xmm6, -32(%edx) + movaps %xmm3, %xmm6 + movdqu %xmm7, -16(%edx) + movaps 48(%ecx), %xmm7 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %eax + add $64, %edx + add $64, %ecx + test %eax, %eax + jz L(Aligned64Loop_start) +L(Aligned64Leave): + sub $0xa0, %ebx + pxor %xmm0, %xmm0 + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %eax + movdqu %xmm4, -64(%edx) + test %eax, %eax + lea 16(%ebx), %ebx + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %eax + movdqu %xmm5, -48(%edx) + test %eax, %eax + lea 16(%ebx), %ebx + jnz L(CopyFrom1To16Bytes) + + movdqu %xmm6, -32(%edx) + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%ebx), %ebx + +/*-----------------End of main part---------------------------*/ + + .p2align 4 +L(CopyFrom1To16Bytes): + add %ebx, %edx + add %ebx, %ecx + + POP (%ebx) + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit1) + test $0x02, %al + jnz L(Exit2) + test $0x04, %al + jnz L(Exit3) + test $0x08, %al + jnz L(Exit4) + test $0x10, %al + jnz L(Exit5) + test $0x20, %al + jnz L(Exit6) + test $0x40, %al + jnz L(Exit7) + /* Exit 8 */ + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) +# ifdef USE_AS_STPCPY + lea 7(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(ExitHigh): + test $0x01, %ah + jnz L(Exit9) + test $0x02, %ah + jnz L(Exit10) + test $0x04, %ah + jnz L(Exit11) + test $0x08, %ah + jnz L(Exit12) + test $0x10, %ah + jnz L(Exit13) + test $0x20, %ah + jnz L(Exit14) + test $0x40, %ah + jnz L(Exit15) + /* Exit 16 */ + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 8(%ecx), %xmm0 + movlpd %xmm0, 8(%edx) +# ifdef USE_AS_STPCPY + lea 15(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit1): + movb (%ecx), %al + movb %al, (%edx) +# ifdef USE_AS_STPCPY + lea (%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit2): + movw (%ecx), %ax + movw %ax, (%edx) +# ifdef USE_AS_STPCPY + lea 1(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit3): + movw (%ecx), %ax + movw 
%ax, (%edx) + movb 2(%ecx), %al + movb %al, 2(%edx) +# ifdef USE_AS_STPCPY + lea 2(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit4): + movl (%ecx), %eax + movl %eax, (%edx) +# ifdef USE_AS_STPCPY + lea 3(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit5): + movl (%ecx), %eax + movl %eax, (%edx) + movb 4(%ecx), %al + movb %al, 4(%edx) +# ifdef USE_AS_STPCPY + lea 4(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit6): + movl (%ecx), %eax + movl %eax, (%edx) + movw 4(%ecx), %ax + movw %ax, 4(%edx) +# ifdef USE_AS_STPCPY + lea 5(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit7): + movl (%ecx), %eax + movl %eax, (%edx) + movl 3(%ecx), %eax + movl %eax, 3(%edx) +# ifdef USE_AS_STPCPY + lea 6(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit9): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movb 8(%ecx), %al + movb %al, 8(%edx) +# ifdef USE_AS_STPCPY + lea 8(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit10): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movw 8(%ecx), %ax + movw %ax, 8(%edx) +# ifdef USE_AS_STPCPY + lea 9(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit11): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movl 7(%ecx), %eax + movl %eax, 7(%edx) +# ifdef USE_AS_STPCPY + lea 10(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit12): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movl 8(%ecx), %eax + movl %eax, 8(%edx) +# ifdef USE_AS_STPCPY + lea 11(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit13): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 5(%ecx), %xmm0 + movlpd %xmm0, 5(%edx) +# ifdef USE_AS_STPCPY + lea 12(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit14): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 6(%ecx), %xmm0 + movlpd %xmm0, 6(%edx) +# ifdef USE_AS_STPCPY + lea 13(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit15): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 7(%ecx), %xmm0 + movlpd %xmm0, 7(%edx) +# ifdef USE_AS_STPCPY + lea 14(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + +CFI_POP (%edi) + + .p2align 4 +L(ExitTail1): + movb (%ecx), %al + movb %al, (%edx) + movl %edx, %eax + RETURN1 + + .p2align 4 +L(ExitTail2): + movw (%ecx), %ax + movw %ax, (%edx) +# ifdef USE_AS_STPCPY + lea 1(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail3): + movw (%ecx), %ax + movw %ax, (%edx) + movb 2(%ecx), %al + movb %al, 2(%edx) +# ifdef USE_AS_STPCPY + lea 2(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail4): + movl (%ecx), %eax + movl %eax, (%edx) +# ifdef USE_AS_STPCPY + lea 3(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail5): + movl (%ecx), %eax + movl %eax, (%edx) + movb 4(%ecx), %al + movb %al, 4(%edx) +# ifdef USE_AS_STPCPY + lea 4(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail6): + movl (%ecx), %eax + movl %eax, (%edx) + movw 4(%ecx), %ax + movw %ax, 4(%edx) +# ifdef USE_AS_STPCPY + lea 5(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail7): + movl (%ecx), %eax + movl %eax, (%edx) + movl 3(%ecx), 
%eax + movl %eax, 3(%edx) +# ifdef USE_AS_STPCPY + lea 6(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail8): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) +# ifdef USE_AS_STPCPY + lea 7(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail9): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movb 8(%ecx), %al + movb %al, 8(%edx) +# ifdef USE_AS_STPCPY + lea 8(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail10): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movw 8(%ecx), %ax + movw %ax, 8(%edx) +# ifdef USE_AS_STPCPY + lea 9(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail11): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movl 7(%ecx), %eax + movl %eax, 7(%edx) +# ifdef USE_AS_STPCPY + lea 10(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail12): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movl 8(%ecx), %eax + movl %eax, 8(%edx) +# ifdef USE_AS_STPCPY + lea 11(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail13): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 5(%ecx), %xmm0 + movlpd %xmm0, 5(%edx) +# ifdef USE_AS_STPCPY + lea 12(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail14): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 6(%ecx), %xmm0 + movlpd %xmm0, 6(%edx) +# ifdef USE_AS_STPCPY + lea 13(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail15): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 7(%ecx), %xmm0 + movlpd %xmm0, 7(%edx) +# ifdef USE_AS_STPCPY + lea 14(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail16): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 8(%ecx), %xmm0 + movlpd %xmm0, 8(%edx) +# ifdef USE_AS_STPCPY + lea 15(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + +END (STRCPY) +# endif + +#endif diff --git a/sysdeps/i386/i686/multiarch/strcpy-ssse3.S b/sysdeps/i386/i686/multiarch/strcpy-ssse3.S new file mode 100644 index 0000000..577d117 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strcpy-ssse3.S @@ -0,0 +1,4090 @@ +/* strcpy with SSSE3 + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + + +#ifndef NOT_IN_libc + +# include + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# ifndef STRCPY +# define STRCPY __strcpy_ssse3 +# endif + +# ifdef USE_AS_STRNCPY +# define PARMS 8 +# define ENTRANCE PUSH(%ebx) +# define RETURN POP(%ebx); ret; CFI_PUSH(%ebx); +# define RETURN1 POP(%edi); POP(%ebx); ret; CFI_PUSH(%ebx); CFI_PUSH(%edi) +# else +# define PARMS 4 +# define ENTRANCE +# define RETURN ret +# define RETURN1 POP(%edi); ret; CFI_PUSH(%edi) +# endif + +# define STR1 PARMS +# define STR2 STR1+4 +# define LEN STR2+4 + +/* In this code following instructions are used for copying: + movb - 1 byte + movw - 2 byte + movl - 4 byte + movlpd - 8 byte + movaps - 16 byte - requires 16 byte alignment + of sourse and destination adresses. + 16 byte alignment: adress is 32bit value, + right four bit of adress shall be 0. +*/ + +.text +ENTRY (STRCPY) + ENTRANCE + mov STR1(%esp), %edx + mov STR2(%esp), %ecx +# ifdef USE_AS_STRNCPY + movl LEN(%esp), %ebx + test %ebx, %ebx + jz L(ExitTail0) + cmp $8, %ebx + jbe L(StrncpyExit8Bytes) +# endif + cmpb $0, (%ecx) + jz L(ExitTail1) + cmpb $0, 1(%ecx) + jz L(ExitTail2) + cmpb $0, 2(%ecx) + jz L(ExitTail3) + cmpb $0, 3(%ecx) + jz L(ExitTail4) + cmpb $0, 4(%ecx) + jz L(ExitTail5) + cmpb $0, 5(%ecx) + jz L(ExitTail6) + cmpb $0, 6(%ecx) + jz L(ExitTail7) + cmpb $0, 7(%ecx) + jz L(ExitTail8) +# ifdef USE_AS_STRNCPY + cmp $16, %ebx + jb L(StrncpyExit15Bytes) +# endif + cmpb $0, 8(%ecx) + jz L(ExitTail9) + cmpb $0, 9(%ecx) + jz L(ExitTail10) + cmpb $0, 10(%ecx) + jz L(ExitTail11) + cmpb $0, 11(%ecx) + jz L(ExitTail12) + cmpb $0, 12(%ecx) + jz L(ExitTail13) + cmpb $0, 13(%ecx) + jz L(ExitTail14) + cmpb $0, 14(%ecx) + jz L(ExitTail15) +# ifdef USE_AS_STRNCPY + cmp $16, %ebx + je L(ExitTail16) +# endif + cmpb $0, 15(%ecx) + jz L(ExitTail16) + + PUSH (%edi) + mov %edx, %edi + PUSH (%esi) +# ifdef USE_AS_STRNCPY + mov %ecx, %esi + and $0xf, %esi + +/* add 16 bytes ecx_shift to ebx */ + + add %esi, %ebx +# endif + lea 16(%ecx), %esi +/* Now: + esi = alignment_16(ecx) + ecx_shift + 16; + ecx_shift = ecx - alignment_16(ecx) +*/ + and $-16, %esi +/* Now: + esi = alignment_16(ecx) + 16 +*/ + pxor %xmm0, %xmm0 + movlpd (%ecx), %xmm1 + movlpd %xmm1, (%edx) +/* + look if there is zero symbol in next 16 bytes of string + from esi to esi + 15 and form mask in xmm0 +*/ + pcmpeqb (%esi), %xmm0 + movlpd 8(%ecx), %xmm1 + movlpd %xmm1, 8(%edx) + +/* convert byte mask in xmm0 to bit mask */ + + pmovmskb %xmm0, %eax + sub %ecx, %esi + +/* esi = 16 - ecx_shift */ + +/* eax = 0: there isn't end of string from position esi to esi+15 */ + +# ifdef USE_AS_STRNCPY + sub $32, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + mov %edx, %eax + lea 16(%edx), %edx +/* Now: + edx = edx + 16 = alignment_16(edx) + edx_shift + 16 +*/ + and $-16, %edx + +/* Now: edx = alignment_16(edx) + 16 */ + + sub %edx, %eax + +/* Now: eax = edx_shift - 16 */ + +# ifdef USE_AS_STRNCPY + add %eax, %esi + lea -1(%esi), %esi + and $1<<31, %esi + test %esi, %esi + jnz L(ContinueCopy) + lea 16(%ebx), %ebx + +L(ContinueCopy): +# endif + sub %eax, %ecx +/* Now: + case ecx_shift >= edx_shift: + ecx = alignment_16(ecx) + (ecx_shift - edx_shift) + 16 + case ecx_shift < edx_shift: + ecx = alignment_16(ecx) + (16 + ecx_shift - edx_shift) +*/ + mov %ecx, %eax 
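/* The 'and' below isolates the low four bits of the adjusted source pointer:
   the relative misalignment of source and destination described in the
   comments above.  Zero means both streams can use aligned 16-byte accesses
   (L(Align16Both)); any other value selects the matching palignr loop,
   L(Shl1)..L(Shl15), further down.  */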
+ and $0xf, %eax +/* Now: + case ecx_shift >= edx_shift: eax = ecx_shift - edx_shift + case ecx_shift < edx_shift: eax = (16 + ecx_shift - edx_shift) + eax can be 0, 1, ..., 15 +*/ + mov $0, %esi + +/* case: ecx_shift == edx_shift */ + + jz L(Align16Both) + + cmp $8, %eax + jae L(ShlHigh8) + cmp $1, %eax + je L(Shl1) + cmp $2, %eax + je L(Shl2) + cmp $3, %eax + je L(Shl3) + cmp $4, %eax + je L(Shl4) + cmp $5, %eax + je L(Shl5) + cmp $6, %eax + je L(Shl6) + jmp L(Shl7) + +L(ShlHigh8): + je L(Shl8) + cmp $9, %eax + je L(Shl9) + cmp $10, %eax + je L(Shl10) + cmp $11, %eax + je L(Shl11) + cmp $12, %eax + je L(Shl12) + cmp $13, %eax + je L(Shl13) + cmp $14, %eax + je L(Shl14) + jmp L(Shl15) + +L(Align16Both): + movaps (%ecx), %xmm1 + movaps 16(%ecx), %xmm2 + movaps %xmm1, (%edx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm3 + movaps %xmm2, (%edx, %esi) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm4 + movaps %xmm3, (%edx, %esi) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm1 + movaps %xmm4, (%edx, %esi) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm2 + movaps %xmm1, (%edx, %esi) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm3 + movaps %xmm2, (%edx, %esi) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps %xmm3, (%edx, %esi) + mov %ecx, %eax + lea 16(%ecx, %esi), %ecx + and $-0x40, %ecx + sub %ecx, %eax + sub %eax, %edx +# ifdef USE_AS_STRNCPY + lea 48+64(%ebx, %eax), %ebx +# endif + mov $-0x40, %esi + +L(Aligned64Loop): + movaps (%ecx), %xmm2 + movaps 32(%ecx), %xmm3 + movaps %xmm2, %xmm4 + movaps 16(%ecx), %xmm5 + movaps %xmm3, %xmm6 + movaps 48(%ecx), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + lea 64(%edx), %edx + pcmpeqb %xmm0, %xmm3 + lea 64(%ecx), %ecx + pmovmskb %xmm3, %eax +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeaveCase2OrCase3) +# endif + test %eax, %eax + jnz L(Aligned64Leave) + movaps %xmm4, -64(%edx) + movaps %xmm5, -48(%edx) + movaps %xmm6, -32(%edx) + movaps %xmm7, -16(%edx) + jmp L(Aligned64Loop) + +L(Aligned64Leave): +# ifdef USE_AS_STRNCPY + lea 48(%ebx), %ebx +# endif + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm5, %xmm0 +# ifdef USE_AS_STRNCPY + lea -16(%ebx), %ebx +# endif + pmovmskb %xmm0, %eax + movaps %xmm4, -64(%edx) + test %eax, %eax + lea 16(%esi), %esi + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm6, %xmm0 +# ifdef USE_AS_STRNCPY + lea -16(%ebx), %ebx +# endif + pmovmskb %xmm0, %eax + 
movaps %xmm5, -48(%edx) + test %eax, %eax + lea 16(%esi), %esi + jnz L(CopyFrom1To16Bytes) + + movaps %xmm6, -32(%edx) + pcmpeqb %xmm7, %xmm0 +# ifdef USE_AS_STRNCPY + lea -16(%ebx), %ebx +# endif + pmovmskb %xmm0, %eax + lea 16(%esi), %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl1): + movaps -1(%ecx), %xmm1 + movaps 15(%ecx), %xmm2 +L(Shl1Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl1LoopExit) + + palignr $1, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 31(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl1LoopExit) + + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 31(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl1LoopExit) + + palignr $1, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 31(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl1LoopExit) + + palignr $1, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + lea 31(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -15(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -1(%ecx), %xmm1 + +L(Shl1LoopStart): + movaps 15(%ecx), %xmm2 + movaps 31(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 47(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 63(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $1, %xmm4, %xmm5 + test %eax, %eax + palignr $1, %xmm3, %xmm4 + jnz L(Shl1Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave1) +# endif + palignr $1, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $1, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl1LoopStart) + +L(Shl1LoopExit): + movaps (%edx), %xmm6 + psrldq $15, %xmm6 + mov $15, %esi + palignr $1, %xmm1, %xmm6 + movaps %xmm6, (%edx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl2): + movaps -2(%ecx), %xmm1 + movaps 14(%ecx), %xmm2 +L(Shl2Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl2LoopExit) + + palignr $2, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 30(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl2LoopExit) + + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 30(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub 
$16, %ebx + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl2LoopExit) + + palignr $2, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 30(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl2LoopExit) + + palignr $2, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + lea 30(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -14(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -2(%ecx), %xmm1 + +L(Shl2LoopStart): + movaps 14(%ecx), %xmm2 + movaps 30(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 46(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 62(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $2, %xmm4, %xmm5 + test %eax, %eax + palignr $2, %xmm3, %xmm4 + jnz L(Shl2Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave2) +# endif + palignr $2, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $2, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl2LoopStart) + +L(Shl2LoopExit): + movaps (%edx), %xmm6 + psrldq $14, %xmm6 + mov $14, %esi + palignr $2, %xmm1, %xmm6 + movaps %xmm6, (%edx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl3): + movaps -3(%ecx), %xmm1 + movaps 13(%ecx), %xmm2 +L(Shl3Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl3LoopExit) + + palignr $3, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 29(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl3LoopExit) + + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 29(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl3LoopExit) + + palignr $3, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 29(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl3LoopExit) + + palignr $3, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + lea 29(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -13(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -3(%ecx), %xmm1 + +L(Shl3LoopStart): + movaps 13(%ecx), %xmm2 + movaps 29(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 45(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 61(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $3, %xmm4, %xmm5 + test %eax, %eax + palignr $3, %xmm3, %xmm4 + jnz L(Shl3Start) +# ifdef USE_AS_STRNCPY + 
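/* strncpy: the loop is about to consume another 64 source bytes; if the remaining count does not cover them, finish through the StrncpyLeave3 tail code.  */ +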
sub $64, %ebx + jbe L(StrncpyLeave3) +# endif + palignr $3, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $3, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl3LoopStart) + +L(Shl3LoopExit): + movaps (%edx), %xmm6 + psrldq $13, %xmm6 + mov $13, %esi + palignr $3, %xmm1, %xmm6 + movaps %xmm6, (%edx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl4): + movaps -4(%ecx), %xmm1 + movaps 12(%ecx), %xmm2 +L(Shl4Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 28(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 28(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 28(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + lea 28(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -12(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -4(%ecx), %xmm1 + +L(Shl4LoopStart): + movaps 12(%ecx), %xmm2 + movaps 28(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 44(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 60(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $4, %xmm4, %xmm5 + test %eax, %eax + palignr $4, %xmm3, %xmm4 + jnz L(Shl4Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave4) +# endif + palignr $4, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $4, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl4LoopStart) + +L(Shl4LoopExit): + movaps (%edx), %xmm6 + psrldq $12, %xmm6 + mov $12, %esi + palignr $4, %xmm1, %xmm6 + movaps %xmm6, (%edx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl5): + movaps -5(%ecx), %xmm1 + movaps 11(%ecx), %xmm2 +L(Shl5Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl5LoopExit) + + palignr $5, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 27(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl5LoopExit) + + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 27(%ecx), 
%xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl5LoopExit) + + palignr $5, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 27(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl5LoopExit) + + palignr $5, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + lea 27(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -11(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -5(%ecx), %xmm1 + +L(Shl5LoopStart): + movaps 11(%ecx), %xmm2 + movaps 27(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 43(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 59(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $5, %xmm4, %xmm5 + test %eax, %eax + palignr $5, %xmm3, %xmm4 + jnz L(Shl5Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave5) +# endif + palignr $5, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $5, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl5LoopStart) + +L(Shl5LoopExit): + movaps (%edx), %xmm6 + psrldq $11, %xmm6 + mov $11, %esi + palignr $5, %xmm1, %xmm6 + movaps %xmm6, (%edx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl6): + movaps -6(%ecx), %xmm1 + movaps 10(%ecx), %xmm2 +L(Shl6Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl6LoopExit) + + palignr $6, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 26(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl6LoopExit) + + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 26(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl6LoopExit) + + palignr $6, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 26(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl6LoopExit) + + palignr $6, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + lea 26(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -10(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -6(%ecx), %xmm1 + +L(Shl6LoopStart): + movaps 10(%ecx), %xmm2 + movaps 26(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 42(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 58(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb 
%xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $6, %xmm4, %xmm5 + test %eax, %eax + palignr $6, %xmm3, %xmm4 + jnz L(Shl6Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave6) +# endif + palignr $6, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $6, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl6LoopStart) + +L(Shl6LoopExit): + movaps (%edx), %xmm6 + psrldq $10, %xmm6 + mov $10, %esi + palignr $6, %xmm1, %xmm6 + movaps %xmm6, (%edx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl7): + movaps -7(%ecx), %xmm1 + movaps 9(%ecx), %xmm2 +L(Shl7Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl7LoopExit) + + palignr $7, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 25(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl7LoopExit) + + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 25(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl7LoopExit) + + palignr $7, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 25(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl7LoopExit) + + palignr $7, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + lea 25(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -9(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -7(%ecx), %xmm1 + +L(Shl7LoopStart): + movaps 9(%ecx), %xmm2 + movaps 25(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 41(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 57(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $7, %xmm4, %xmm5 + test %eax, %eax + palignr $7, %xmm3, %xmm4 + jnz L(Shl7Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave7) +# endif + palignr $7, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $7, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl7LoopStart) + +L(Shl7LoopExit): + movaps (%edx), %xmm6 + psrldq $9, %xmm6 + mov $9, %esi + palignr $7, %xmm1, %xmm6 + movaps %xmm6, (%edx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl8): + movaps -8(%ecx), %xmm1 + movaps 8(%ecx), %xmm2 +L(Shl8Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 24(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + 
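/* strncpy: stop via the case 2/3 exit when fewer than 16 bytes of the count remain.  */ +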
sub $16, %ebx + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 24(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 24(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + lea 24(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -8(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -8(%ecx), %xmm1 + +L(Shl8LoopStart): + movaps 8(%ecx), %xmm2 + movaps 24(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 40(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 56(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $8, %xmm4, %xmm5 + test %eax, %eax + palignr $8, %xmm3, %xmm4 + jnz L(Shl8Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave8) +# endif + palignr $8, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $8, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl8LoopStart) + +L(Shl8LoopExit): + movaps (%edx), %xmm6 + psrldq $8, %xmm6 + mov $8, %esi + palignr $8, %xmm1, %xmm6 + movaps %xmm6, (%edx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl9): + movaps -9(%ecx), %xmm1 + movaps 7(%ecx), %xmm2 +L(Shl9Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl9LoopExit) + + palignr $9, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 23(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl9LoopExit) + + palignr $9, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 23(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl9LoopExit) + + palignr $9, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 23(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl9LoopExit) + + palignr $9, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + lea 23(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -7(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -9(%ecx), %xmm1 + +L(Shl9LoopStart): + movaps 7(%ecx), %xmm2 + movaps 23(%ecx), %xmm3 + movaps 
%xmm3, %xmm6 + movaps 39(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 55(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $9, %xmm4, %xmm5 + test %eax, %eax + palignr $9, %xmm3, %xmm4 + jnz L(Shl9Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave9) +# endif + palignr $9, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $9, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl9LoopStart) + +L(Shl9LoopExit): + movaps (%edx), %xmm6 + psrldq $7, %xmm6 + mov $7, %esi + palignr $9, %xmm1, %xmm6 + movaps %xmm6, (%edx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl10): + movaps -10(%ecx), %xmm1 + movaps 6(%ecx), %xmm2 +L(Shl10Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl10LoopExit) + + palignr $10, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 22(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl10LoopExit) + + palignr $10, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 22(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl10LoopExit) + + palignr $10, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 22(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl10LoopExit) + + palignr $10, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + lea 22(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -6(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -10(%ecx), %xmm1 + +L(Shl10LoopStart): + movaps 6(%ecx), %xmm2 + movaps 22(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 38(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 54(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $10, %xmm4, %xmm5 + test %eax, %eax + palignr $10, %xmm3, %xmm4 + jnz L(Shl10Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave10) +# endif + palignr $10, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $10, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl10LoopStart) + +L(Shl10LoopExit): + movaps (%edx), %xmm6 + psrldq $6, %xmm6 + mov $6, %esi + palignr $10, %xmm1, %xmm6 + movaps %xmm6, (%edx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl11): + movaps -11(%ecx), %xmm1 + movaps 5(%ecx), %xmm2 +L(Shl11Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl11LoopExit) + + palignr $11, %xmm1, %xmm2 + movaps %xmm3, 
%xmm1 + movaps %xmm2, (%edx) + movaps 21(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl11LoopExit) + + palignr $11, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 21(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl11LoopExit) + + palignr $11, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 21(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl11LoopExit) + + palignr $11, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + lea 21(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -5(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -11(%ecx), %xmm1 + +L(Shl11LoopStart): + movaps 5(%ecx), %xmm2 + movaps 21(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 37(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 53(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $11, %xmm4, %xmm5 + test %eax, %eax + palignr $11, %xmm3, %xmm4 + jnz L(Shl11Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave11) +# endif + palignr $11, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $11, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl11LoopStart) + +L(Shl11LoopExit): + movaps (%edx), %xmm6 + psrldq $5, %xmm6 + mov $5, %esi + palignr $11, %xmm1, %xmm6 + movaps %xmm6, (%edx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl12): + movaps -12(%ecx), %xmm1 + movaps 4(%ecx), %xmm2 +L(Shl12Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 20(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 20(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 20(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + lea 20(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and 
$-0x40, %ecx + sub %ecx, %eax + lea -4(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -12(%ecx), %xmm1 + +L(Shl12LoopStart): + movaps 4(%ecx), %xmm2 + movaps 20(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 36(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 52(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $12, %xmm4, %xmm5 + test %eax, %eax + palignr $12, %xmm3, %xmm4 + jnz L(Shl12Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave12) +# endif + palignr $12, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $12, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl12LoopStart) + +L(Shl12LoopExit): + movaps (%edx), %xmm6 + psrldq $4, %xmm6 + mov $4, %esi + palignr $12, %xmm1, %xmm6 + movaps %xmm6, (%edx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl13): + movaps -13(%ecx), %xmm1 + movaps 3(%ecx), %xmm2 +L(Shl13Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl13LoopExit) + + palignr $13, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 19(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl13LoopExit) + + palignr $13, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 19(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl13LoopExit) + + palignr $13, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 19(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl13LoopExit) + + palignr $13, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + lea 19(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -3(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -13(%ecx), %xmm1 + +L(Shl13LoopStart): + movaps 3(%ecx), %xmm2 + movaps 19(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 35(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 51(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $13, %xmm4, %xmm5 + test %eax, %eax + palignr $13, %xmm3, %xmm4 + jnz L(Shl13Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave13) +# endif + palignr $13, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $13, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl13LoopStart) + +L(Shl13LoopExit): + movaps (%edx), %xmm6 + psrldq $3, %xmm6 + mov $3, %esi + palignr $13, %xmm1, %xmm6 + movaps %xmm6, (%edx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl14): + movaps -14(%ecx), %xmm1 + movaps 2(%ecx), %xmm2 +L(Shl14Start): + 
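/* Source and destination differ by 14 mod 16: keep both loads and stores 16-byte aligned and splice adjacent source blocks with palignr $14 to form the bytes that actually follow the source pointer.  */ +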
pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl14LoopExit) + + palignr $14, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 18(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl14LoopExit) + + palignr $14, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 18(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl14LoopExit) + + palignr $14, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 18(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl14LoopExit) + + palignr $14, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + lea 18(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -2(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -14(%ecx), %xmm1 + +L(Shl14LoopStart): + movaps 2(%ecx), %xmm2 + movaps 18(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 34(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 50(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $14, %xmm4, %xmm5 + test %eax, %eax + palignr $14, %xmm3, %xmm4 + jnz L(Shl14Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave14) +# endif + palignr $14, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $14, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl14LoopStart) + +L(Shl14LoopExit): + movaps (%edx), %xmm6 + psrldq $2, %xmm6 + mov $2, %esi + palignr $14, %xmm1, %xmm6 + movaps %xmm6, (%edx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl15): + movaps -15(%ecx), %xmm1 + movaps 1(%ecx), %xmm2 +L(Shl15Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl15LoopExit) + + palignr $15, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 17(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl15LoopExit) + + palignr $15, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 17(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl15LoopExit) + + palignr $15, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 17(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub 
$16, %ebx + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl15LoopExit) + + palignr $15, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + lea 17(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -1(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -15(%ecx), %xmm1 + +L(Shl15LoopStart): + movaps 1(%ecx), %xmm2 + movaps 17(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 33(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 49(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $15, %xmm4, %xmm5 + test %eax, %eax + palignr $15, %xmm3, %xmm4 + jnz L(Shl15Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave15) +# endif + palignr $15, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $15, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl15LoopStart) + +L(Shl15LoopExit): + movaps (%edx), %xmm6 + psrldq $1, %xmm6 + mov $1, %esi + palignr $15, %xmm1, %xmm6 + movaps %xmm6, (%edx) + + .p2align 4 +L(CopyFrom1To16Bytes): +# ifdef USE_AS_STRNCPY + add $16, %ebx +# endif + add %esi, %edx + add %esi, %ecx + + POP (%esi) + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit1) + test $0x02, %al + jnz L(Exit2) + test $0x04, %al + jnz L(Exit3) + test $0x08, %al + jnz L(Exit4) + test $0x10, %al + jnz L(Exit5) + test $0x20, %al + jnz L(Exit6) + test $0x40, %al + jnz L(Exit7) + + .p2align 4 +L(Exit8): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) +# ifdef USE_AS_STPCPY + lea 7(%edx), %eax +# else + movl %edi, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $8, %ebx + lea 8(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(ExitHigh): + test $0x01, %ah + jnz L(Exit9) + test $0x02, %ah + jnz L(Exit10) + test $0x04, %ah + jnz L(Exit11) + test $0x08, %ah + jnz L(Exit12) + test $0x10, %ah + jnz L(Exit13) + test $0x20, %ah + jnz L(Exit14) + test $0x40, %ah + jnz L(Exit15) + + .p2align 4 +L(Exit16): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 8(%ecx), %xmm0 + movlpd %xmm0, 8(%edx) +# ifdef USE_AS_STPCPY + lea 15(%edx), %eax +# else + movl %edi, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $16, %ebx + lea 16(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + +# ifdef USE_AS_STRNCPY + + CFI_PUSH(%esi) + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %ebx + add %esi, %ecx + lea (%esi, %edx), %esi + lea -9(%ebx), %edx + and $1<<7, %dh + or %al, %dh + test %dh, %dh + lea (%esi), %edx + POP (%esi) + jz L(ExitHighCase2) + + cmp $1, %ebx + je L(Exit1) + test $0x01, %al + jnz L(Exit1) + cmp $2, %ebx + je L(Exit2) + test $0x02, %al + jnz L(Exit2) + cmp $3, %ebx + je L(Exit3) + test $0x04, %al + jnz L(Exit3) + cmp $4, %ebx + je L(Exit4) + test $0x08, %al + jnz L(Exit4) + cmp $5, %ebx + je L(Exit5) + test $0x10, %al + jnz L(Exit5) + cmp $6, %ebx + je L(Exit6) + test $0x20, %al + jnz L(Exit6) + cmp $7, %ebx + je L(Exit7) + test $0x40, %al + jnz L(Exit7) + jmp L(Exit8) + + .p2align 4 +L(ExitHighCase2): + cmp $9, %ebx + je L(Exit9) + test $0x01, %ah + jnz L(Exit9) + cmp $10, %ebx + je L(Exit10) + test $0x02, %ah + jnz L(Exit10) + cmp $11, %ebx + je L(Exit11) + test $0x04, %ah + jnz L(Exit11) + cmp $12, %ebx 
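+/* 12 bytes of the count left, or a zero byte at offset 11 (bit 3 of %ah): copy exactly 12 bytes.  */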
+ je L(Exit12) + test $0x8, %ah + jnz L(Exit12) + cmp $13, %ebx + je L(Exit13) + test $0x10, %ah + jnz L(Exit13) + cmp $14, %ebx + je L(Exit14) + test $0x20, %ah + jnz L(Exit14) + cmp $15, %ebx + je L(Exit15) + test $0x40, %ah + jnz L(Exit15) + jmp L(Exit16) + + CFI_PUSH(%esi) + +L(CopyFrom1To16BytesCase2OrCase3): + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + + .p2align 4 +L(CopyFrom1To16BytesCase3): + add $16, %ebx + add %esi, %edx + add %esi, %ecx + + POP (%esi) + cmp $16, %ebx + je L(Exit16) + cmp $8, %ebx + je L(Exit8) + jg L(More8Case3) + cmp $4, %ebx + je L(Exit4) + jg L(More4Case3) + cmp $2, %ebx + jl L(Exit1) + je L(Exit2) + jg L(Exit3) +L(More8Case3): /* but less than 16 */ + cmp $12, %ebx + je L(Exit12) + jl L(Less12Case3) + cmp $14, %ebx + jl L(Exit13) + je L(Exit14) + jg L(Exit15) +L(More4Case3): /* but less than 8 */ + cmp $6, %ebx + jl L(Exit5) + je L(Exit6) + jg L(Exit7) +L(Less12Case3): /* but more than 8 */ + cmp $10, %ebx + jl L(Exit9) + je L(Exit10) + jg L(Exit11) +# endif + + .p2align 4 +L(Exit1): + movb (%ecx), %al + movb %al, (%edx) +# ifdef USE_AS_STPCPY + lea (%edx), %eax +# else + movl %edi, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $1, %ebx + lea 1(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit2): + movw (%ecx), %ax + movw %ax, (%edx) +# ifdef USE_AS_STPCPY + lea 1(%edx), %eax +# else + movl %edi, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $2, %ebx + lea 2(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit3): + movw (%ecx), %ax + movw %ax, (%edx) + movb 2(%ecx), %al + movb %al, 2(%edx) +# ifdef USE_AS_STPCPY + lea 2(%edx), %eax +# else + movl %edi, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $3, %ebx + lea 3(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit4): + movl (%ecx), %eax + movl %eax, (%edx) +# ifdef USE_AS_STPCPY + lea 3(%edx), %eax +# else + movl %edi, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $4, %ebx + lea 4(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit5): + movl (%ecx), %eax + movl %eax, (%edx) + movb 4(%ecx), %al + movb %al, 4(%edx) +# ifdef USE_AS_STPCPY + lea 4(%edx), %eax +# else + movl %edi, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $5, %ebx + lea 5(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit6): + movl (%ecx), %eax + movl %eax, (%edx) + movw 4(%ecx), %ax + movw %ax, 4(%edx) +# ifdef USE_AS_STPCPY + lea 5(%edx), %eax +# else + movl %edi, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $6, %ebx + lea 6(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit7): + movl (%ecx), %eax + movl %eax, (%edx) + movl 3(%ecx), %eax + movl %eax, 3(%edx) +# ifdef USE_AS_STPCPY + lea 6(%edx), %eax +# else + movl %edi, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $7, %ebx + lea 7(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit9): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movb 8(%ecx), %al + movb %al, 8(%edx) +# ifdef 
USE_AS_STPCPY + lea 8(%edx), %eax +# else + movl %edi, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $9, %ebx + lea 9(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit10): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movw 8(%ecx), %ax + movw %ax, 8(%edx) +# ifdef USE_AS_STPCPY + lea 9(%edx), %eax +# else + movl %edi, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $10, %ebx + lea 10(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit11): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl 7(%ecx), %eax + movl %eax, 7(%edx) +# ifdef USE_AS_STPCPY + lea 10(%edx), %eax +# else + movl %edi, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $11, %ebx + lea 11(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit12): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl 8(%ecx), %eax + movl %eax, 8(%edx) +# ifdef USE_AS_STPCPY + lea 11(%edx), %eax +# else + movl %edi, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $12, %ebx + lea 12(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit13): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 5(%ecx), %xmm0 + movlpd %xmm0, 5(%edx) +# ifdef USE_AS_STPCPY + lea 12(%edx), %eax +# else + movl %edi, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $13, %ebx + lea 13(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit14): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 6(%ecx), %xmm0 + movlpd %xmm0, 6(%edx) +# ifdef USE_AS_STPCPY + lea 13(%edx), %eax +# else + movl %edi, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $14, %ebx + lea 14(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit15): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 7(%ecx), %xmm0 + movlpd %xmm0, 7(%edx) +# ifdef USE_AS_STPCPY + lea 14(%edx), %eax +# else + movl %edi, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $15, %ebx + lea 15(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + +CFI_POP (%edi) + +# ifdef USE_AS_STRNCPY + .p2align 4 +L(Fill0): + RETURN + + .p2align 4 +L(Fill1): + movb %dl, (%ecx) + RETURN + + .p2align 4 +L(Fill2): + movw %dx, (%ecx) + RETURN + + .p2align 4 +L(Fill3): + movw %dx, (%ecx) + movb %dl, 2(%ecx) + RETURN + + .p2align 4 +L(Fill4): + movl %edx, (%ecx) + RETURN + + .p2align 4 +L(Fill5): + movl %edx, (%ecx) + movb %dl, 4(%ecx) + RETURN + + .p2align 4 +L(Fill6): + movl %edx, (%ecx) + movw %dx, 4(%ecx) + RETURN + + .p2align 4 +L(Fill7): + movl %edx, (%ecx) + movl %edx, 3(%ecx) + RETURN + + .p2align 4 +L(Fill8): + movlpd %xmm0, (%ecx) + RETURN + + .p2align 4 +L(Fill9): + movlpd %xmm0, (%ecx) + movb %dl, 8(%ecx) + RETURN + + .p2align 4 +L(Fill10): + movlpd %xmm0, (%ecx) + movw %dx, 8(%ecx) + RETURN + + .p2align 4 +L(Fill11): + movlpd %xmm0, (%ecx) + movl %edx, 7(%ecx) + RETURN + + .p2align 4 +L(Fill12): + movlpd %xmm0, (%ecx) + movl %edx, 8(%ecx) + RETURN + + .p2align 4 +L(Fill13): + movlpd %xmm0, (%ecx) + movlpd %xmm0, 5(%ecx) + RETURN + + .p2align 4 +L(Fill14): + movlpd %xmm0, (%ecx) 
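+/* xmm0 is zero here; the two overlapping 8-byte stores clear the 14 bytes.  */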
+ movlpd %xmm0, 6(%ecx) + RETURN + + .p2align 4 +L(Fill15): + movlpd %xmm0, (%ecx) + movlpd %xmm0, 7(%ecx) + RETURN + + .p2align 4 +L(Fill16): + movlpd %xmm0, (%ecx) + movlpd %xmm0, 8(%ecx) + RETURN + + .p2align 4 +L(StrncpyFillExit1): + lea 16(%ebx), %ebx +L(FillFrom1To16Bytes): + test %ebx, %ebx + jz L(Fill0) + cmp $16, %ebx + je L(Fill16) + cmp $8, %ebx + je L(Fill8) + jg L(FillMore8) + cmp $4, %ebx + je L(Fill4) + jg L(FillMore4) + cmp $2, %ebx + jl L(Fill1) + je L(Fill2) + jg L(Fill3) +L(FillMore8): /* but less than 16 */ + cmp $12, %ebx + je L(Fill12) + jl L(FillLess12) + cmp $14, %ebx + jl L(Fill13) + je L(Fill14) + jg L(Fill15) +L(FillMore4): /* but less than 8 */ + cmp $6, %ebx + jl L(Fill5) + je L(Fill6) + jg L(Fill7) +L(FillLess12): /* but more than 8 */ + cmp $10, %ebx + jl L(Fill9) + je L(Fill10) + jmp L(Fill11) + + CFI_PUSH(%edi) + + .p2align 4 +L(StrncpyFillTailWithZero1): + POP (%edi) +L(StrncpyFillTailWithZero): + pxor %xmm0, %xmm0 + xor %edx, %edx + sub $16, %ebx + jbe L(StrncpyFillExit1) + + movlpd %xmm0, (%ecx) + movlpd %xmm0, 8(%ecx) + + lea 16(%ecx), %ecx + + mov %ecx, %edx + and $0xf, %edx + sub %edx, %ecx + add %edx, %ebx + xor %edx, %edx + sub $64, %ebx + jb L(StrncpyFillLess64) + +L(StrncpyFillLoopMovdqa): + movdqa %xmm0, (%ecx) + movdqa %xmm0, 16(%ecx) + movdqa %xmm0, 32(%ecx) + movdqa %xmm0, 48(%ecx) + lea 64(%ecx), %ecx + sub $64, %ebx + jae L(StrncpyFillLoopMovdqa) + +L(StrncpyFillLess64): + add $32, %ebx + jl L(StrncpyFillLess32) + movdqa %xmm0, (%ecx) + movdqa %xmm0, 16(%ecx) + lea 32(%ecx), %ecx + sub $16, %ebx + jl L(StrncpyFillExit1) + movdqa %xmm0, (%ecx) + lea 16(%ecx), %ecx + jmp L(FillFrom1To16Bytes) + +L(StrncpyFillLess32): + add $16, %ebx + jl L(StrncpyFillExit1) + movdqa %xmm0, (%ecx) + lea 16(%ecx), %ecx + jmp L(FillFrom1To16Bytes) +# endif + + .p2align 4 +L(ExitTail1): + movb (%ecx), %al + movb %al, (%edx) +# ifdef USE_AS_STPCPY + lea (%edx), %eax +# else + movl %edx, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $1, %ebx + lea 1(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail2): + movw (%ecx), %ax + movw %ax, (%edx) +# ifdef USE_AS_STPCPY + lea 1(%edx), %eax +# else + movl %edx, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $2, %ebx + lea 2(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail3): + movw (%ecx), %ax + movw %ax, (%edx) + movb 2(%ecx), %al + movb %al, 2(%edx) +# ifdef USE_AS_STPCPY + lea 2(%edx), %eax +# else + movl %edx, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $3, %ebx + lea 3(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail4): + movl (%ecx), %eax + movl %eax, (%edx) +# ifdef USE_AS_STPCPY + lea 3(%edx), %eax +# else + movl %edx, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $4, %ebx + lea 4(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail5): + movl (%ecx), %eax + movl %eax, (%edx) + movb 4(%ecx), %al + movb %al, 4(%edx) +# ifdef USE_AS_STPCPY + lea 4(%edx), %eax +# else + movl %edx, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $5, %ebx + lea 5(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail6): + movl (%ecx), 
%eax + movl %eax, (%edx) + movw 4(%ecx), %ax + movw %ax, 4(%edx) +# ifdef USE_AS_STPCPY + lea 5(%edx), %eax +# else + movl %edx, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $6, %ebx + lea 6(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail7): + movl (%ecx), %eax + movl %eax, (%edx) + movl 3(%ecx), %eax + movl %eax, 3(%edx) +# ifdef USE_AS_STPCPY + lea 6(%edx), %eax +# else + movl %edx, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $7, %ebx + lea 7(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail9): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movb 8(%ecx), %al + movb %al, 8(%edx) +# ifdef USE_AS_STPCPY + lea 8(%edx), %eax +# else + movl %edx, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $9, %ebx + lea 9(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail10): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movw 8(%ecx), %ax + movw %ax, 8(%edx) +# ifdef USE_AS_STPCPY + lea 9(%edx), %eax +# else + movl %edx, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $10, %ebx + lea 10(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail11): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl 7(%ecx), %eax + movl %eax, 7(%edx) +# ifdef USE_AS_STPCPY + lea 10(%edx), %eax +# else + movl %edx, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $11, %ebx + lea 11(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail12): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl 8(%ecx), %eax + movl %eax, 8(%edx) +# ifdef USE_AS_STPCPY + lea 11(%edx), %eax +# else + movl %edx, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $12, %ebx + lea 12(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail13): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 5(%ecx), %xmm0 + movlpd %xmm0, 5(%edx) +# ifdef USE_AS_STPCPY + lea 12(%edx), %eax +# else + movl %edx, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $13, %ebx + lea 13(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail14): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 6(%ecx), %xmm0 + movlpd %xmm0, 6(%edx) +# ifdef USE_AS_STPCPY + lea 13(%edx), %eax +# else + movl %edx, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $14, %ebx + lea 14(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail16): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 8(%ecx), %xmm0 + movlpd %xmm0, 8(%edx) +# ifdef USE_AS_STPCPY + lea 15(%edx), %eax +# else + movl %edx, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $16, %ebx + lea 16(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN +# ifdef USE_AS_STRNCPY +L(StrncpyLeaveCase2OrCase3): + test %eax, %eax + jnz L(Aligned64LeaveCase2) + +L(Aligned64LeaveCase3): + add $48, %ebx + jle L(CopyFrom1To16BytesCase3) + movaps %xmm4, -64(%edx) + lea 16(%esi), %esi + 
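/* No zero byte was seen, only the count ran out: keep flushing whole 16-byte blocks while the count allows, then let CopyFrom1To16BytesCase3 write the final partial block.  */ +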
sub $16, %ebx + jbe L(CopyFrom1To16BytesCase3) + movaps %xmm5, -48(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase3) + movaps %xmm6, -32(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx + jmp L(CopyFrom1To16BytesCase3) + +L(Aligned64LeaveCase2): + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %eax + add $48, %ebx + jle L(CopyFrom1To16BytesCase2OrCase3) + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm4, -64(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm5, -48(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm6, -32(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx + jmp L(CopyFrom1To16BytesCase2) +/* -------------------------------------------------- */ +L(StrncpyExit1Case2OrCase3): + movaps (%edx), %xmm6 + psrldq $15, %xmm6 + mov $15, %esi + palignr $1, %xmm1, %xmm6 + movaps %xmm6, (%edx) + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit2Case2OrCase3): + movaps (%edx), %xmm6 + psrldq $14, %xmm6 + mov $14, %esi + palignr $2, %xmm1, %xmm6 + movaps %xmm6, (%edx) + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit3Case2OrCase3): + movaps (%edx), %xmm6 + psrldq $13, %xmm6 + mov $13, %esi + palignr $3, %xmm1, %xmm6 + movaps %xmm6, (%edx) + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit4Case2OrCase3): + movaps (%edx), %xmm6 + psrldq $12, %xmm6 + mov $12, %esi + palignr $4, %xmm1, %xmm6 + movaps %xmm6, (%edx) + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit5Case2OrCase3): + movaps (%edx), %xmm6 + psrldq $11, %xmm6 + mov $11, %esi + palignr $5, %xmm1, %xmm6 + movaps %xmm6, (%edx) + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit6Case2OrCase3): + movaps (%edx), %xmm6 + psrldq $10, %xmm6 + mov $10, %esi + palignr $6, %xmm1, %xmm6 + movaps %xmm6, (%edx) + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit7Case2OrCase3): + movaps (%edx), %xmm6 + psrldq $9, %xmm6 + mov $9, %esi + palignr $7, %xmm1, %xmm6 + movaps %xmm6, (%edx) + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit8Case2OrCase3): + movaps (%edx), %xmm6 + psrldq $8, %xmm6 + mov $8, %esi + palignr $8, %xmm1, %xmm6 + movaps %xmm6, (%edx) + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit9Case2OrCase3): + movaps (%edx), %xmm6 + psrldq $7, %xmm6 + mov $7, %esi + palignr $9, %xmm1, %xmm6 + movaps %xmm6, (%edx) + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit10Case2OrCase3): + movaps (%edx), %xmm6 + psrldq $6, %xmm6 + mov $6, %esi + palignr $10, %xmm1, %xmm6 + movaps %xmm6, (%edx) + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit11Case2OrCase3): + movaps (%edx), %xmm6 + psrldq $5, %xmm6 + mov $5, %esi + palignr $11, %xmm1, %xmm6 + movaps %xmm6, (%edx) + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit12Case2OrCase3): + movaps (%edx), %xmm6 + psrldq 
$4, %xmm6 + mov $4, %esi + palignr $12, %xmm1, %xmm6 + movaps %xmm6, (%edx) + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit13Case2OrCase3): + movaps (%edx), %xmm6 + psrldq $3, %xmm6 + mov $3, %esi + palignr $13, %xmm1, %xmm6 + movaps %xmm6, (%edx) + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit14Case2OrCase3): + movaps (%edx), %xmm6 + psrldq $2, %xmm6 + mov $2, %esi + palignr $14, %xmm1, %xmm6 + movaps %xmm6, (%edx) + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit15Case2OrCase3): + movaps (%edx), %xmm6 + psrldq $1, %xmm6 + mov $1, %esi + palignr $15, %xmm1, %xmm6 + movaps %xmm6, (%edx) + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave1): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit1) + palignr $1, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 31(%ecx), %xmm2 + lea 16(%esi), %esi + movaps %xmm2, %xmm3 + sub $16, %ebx + jbe L(StrncpyExit1) + palignr $1, %xmm1, %xmm2 + movaps %xmm2, 16(%edx) + movaps 31+16(%ecx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit1) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit1) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx + +L(StrncpyExit1): + movaps (%edx, %esi), %xmm6 + psrldq $15, %xmm6 + palignr $1, %xmm1, %xmm6 + movaps %xmm6, (%edx, %esi) + lea 15(%esi), %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave2): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit2) + palignr $2, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 30(%ecx), %xmm2 + lea 16(%esi), %esi + movaps %xmm2, %xmm3 + sub $16, %ebx + jbe L(StrncpyExit2) + palignr $2, %xmm1, %xmm2 + movaps %xmm2, 16(%edx) + movaps 30+16(%ecx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit2) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit2) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx + +L(StrncpyExit2): + movaps (%edx, %esi), %xmm6 + psrldq $14, %xmm6 + palignr $2, %xmm1, %xmm6 + movaps %xmm6, (%edx, %esi) + lea 14(%esi), %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave3): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit3) + palignr $3, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 29(%ecx), %xmm2 + lea 16(%esi), %esi + movaps %xmm2, %xmm3 + sub $16, %ebx + jbe L(StrncpyExit3) + palignr $3, %xmm1, %xmm2 + movaps %xmm2, 16(%edx) + movaps 29+16(%ecx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit3) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit3) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx + +L(StrncpyExit3): + movaps (%edx, %esi), %xmm6 + psrldq $13, %xmm6 + palignr $3, %xmm1, %xmm6 + movaps %xmm6, (%edx, %esi) + lea 13(%esi), %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave4): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit4) + palignr $4, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 28(%ecx), %xmm2 + lea 16(%esi), %esi + movaps %xmm2, %xmm3 + sub $16, %ebx + jbe L(StrncpyExit4) + palignr $4, %xmm1, %xmm2 + movaps %xmm2, 16(%edx) + movaps 28+16(%ecx), 
%xmm2 + movaps %xmm3, %xmm1 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit4) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit4) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx + +L(StrncpyExit4): + movaps (%edx, %esi), %xmm6 + psrldq $12, %xmm6 + palignr $4, %xmm1, %xmm6 + movaps %xmm6, (%edx, %esi) + lea 12(%esi), %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave5): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit5) + palignr $5, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 27(%ecx), %xmm2 + lea 16(%esi), %esi + movaps %xmm2, %xmm3 + sub $16, %ebx + jbe L(StrncpyExit5) + palignr $5, %xmm1, %xmm2 + movaps %xmm2, 16(%edx) + movaps 27+16(%ecx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit5) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit5) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx + +L(StrncpyExit5): + movaps (%edx, %esi), %xmm6 + psrldq $11, %xmm6 + palignr $5, %xmm1, %xmm6 + movaps %xmm6, (%edx, %esi) + lea 11(%esi), %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave6): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit6) + palignr $6, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 26(%ecx), %xmm2 + lea 16(%esi), %esi + movaps %xmm2, %xmm3 + sub $16, %ebx + jbe L(StrncpyExit6) + palignr $6, %xmm1, %xmm2 + movaps %xmm2, 16(%edx) + movaps 26+16(%ecx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit6) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit6) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx + +L(StrncpyExit6): + movaps (%edx, %esi), %xmm6 + psrldq $10, %xmm6 + palignr $6, %xmm1, %xmm6 + movaps %xmm6, (%edx, %esi) + lea 10(%esi), %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave7): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit7) + palignr $7, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 25(%ecx), %xmm2 + lea 16(%esi), %esi + movaps %xmm2, %xmm3 + sub $16, %ebx + jbe L(StrncpyExit7) + palignr $7, %xmm1, %xmm2 + movaps %xmm2, 16(%edx) + movaps 25+16(%ecx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit7) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit7) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx + +L(StrncpyExit7): + movaps (%edx, %esi), %xmm6 + psrldq $9, %xmm6 + palignr $7, %xmm1, %xmm6 + movaps %xmm6, (%edx, %esi) + lea 9(%esi), %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave8): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit8) + palignr $8, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 24(%ecx), %xmm2 + lea 16(%esi), %esi + movaps %xmm2, %xmm3 + sub $16, %ebx + jbe L(StrncpyExit8) + palignr $8, %xmm1, %xmm2 + movaps %xmm2, 16(%edx) + movaps 24+16(%ecx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit8) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit8) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx + +L(StrncpyExit8): + movaps (%edx, %esi), %xmm6 + psrldq $8, %xmm6 + palignr $8, %xmm1, 
%xmm6 + movaps %xmm6, (%edx, %esi) + lea 8(%esi), %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave9): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit9) + palignr $9, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 23(%ecx), %xmm2 + lea 16(%esi), %esi + movaps %xmm2, %xmm3 + sub $16, %ebx + jbe L(StrncpyExit9) + palignr $9, %xmm1, %xmm2 + movaps %xmm2, 16(%edx) + movaps 23+16(%ecx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit9) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit9) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx + +L(StrncpyExit9): + movaps (%edx, %esi), %xmm6 + psrldq $7, %xmm6 + palignr $9, %xmm1, %xmm6 + movaps %xmm6, (%edx, %esi) + lea 7(%esi), %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave10): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit10) + palignr $10, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 22(%ecx), %xmm2 + lea 16(%esi), %esi + movaps %xmm2, %xmm3 + sub $16, %ebx + jbe L(StrncpyExit10) + palignr $10, %xmm1, %xmm2 + movaps %xmm2, 16(%edx) + movaps 22+16(%ecx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit10) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit10) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx + +L(StrncpyExit10): + movaps (%edx, %esi), %xmm6 + psrldq $6, %xmm6 + palignr $10, %xmm1, %xmm6 + movaps %xmm6, (%edx, %esi) + lea 6(%esi), %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave11): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit11) + palignr $11, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 21(%ecx), %xmm2 + lea 16(%esi), %esi + movaps %xmm2, %xmm3 + sub $16, %ebx + jbe L(StrncpyExit11) + palignr $11, %xmm1, %xmm2 + movaps %xmm2, 16(%edx) + movaps 21+16(%ecx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit11) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit11) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx + +L(StrncpyExit11): + movaps (%edx, %esi), %xmm6 + psrldq $5, %xmm6 + palignr $11, %xmm1, %xmm6 + movaps %xmm6, (%edx, %esi) + lea 5(%esi), %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave12): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit12) + palignr $12, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 20(%ecx), %xmm2 + lea 16(%esi), %esi + movaps %xmm2, %xmm3 + sub $16, %ebx + jbe L(StrncpyExit12) + palignr $12, %xmm1, %xmm2 + movaps %xmm2, 16(%edx) + movaps 20+16(%ecx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit12) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit12) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx + +L(StrncpyExit12): + movaps (%edx, %esi), %xmm6 + psrldq $4, %xmm6 + palignr $12, %xmm1, %xmm6 + movaps %xmm6, (%edx, %esi) + lea 4(%esi), %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave13): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit13) + palignr $13, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 19(%ecx), %xmm2 + lea 16(%esi), %esi + movaps %xmm2, %xmm3 + sub $16, %ebx + jbe L(StrncpyExit13) + 
palignr $13, %xmm1, %xmm2 + movaps %xmm2, 16(%edx) + movaps 19+16(%ecx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit13) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit13) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx + +L(StrncpyExit13): + movaps (%edx, %esi), %xmm6 + psrldq $3, %xmm6 + palignr $13, %xmm1, %xmm6 + movaps %xmm6, (%edx, %esi) + lea 3(%esi), %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave14): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit14) + palignr $14, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 18(%ecx), %xmm2 + lea 16(%esi), %esi + movaps %xmm2, %xmm3 + sub $16, %ebx + jbe L(StrncpyExit14) + palignr $14, %xmm1, %xmm2 + movaps %xmm2, 16(%edx) + movaps 18+16(%ecx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit14) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit14) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx + +L(StrncpyExit14): + movaps (%edx, %esi), %xmm6 + psrldq $2, %xmm6 + palignr $14, %xmm1, %xmm6 + movaps %xmm6, (%edx, %esi) + lea 2(%esi), %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave15): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit15) + palignr $15, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 17(%ecx), %xmm2 + lea 16(%esi), %esi + movaps %xmm2, %xmm3 + sub $16, %ebx + jbe L(StrncpyExit15) + palignr $15, %xmm1, %xmm2 + movaps %xmm2, 16(%edx) + movaps 17+16(%ecx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit15) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit15) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx + +L(StrncpyExit15): + movaps (%edx, %esi), %xmm6 + psrldq $1, %xmm6 + palignr $15, %xmm1, %xmm6 + movaps %xmm6, (%edx, %esi) + lea 1(%esi), %esi + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(ExitTail0): + movl %edx, %eax + RETURN + + .p2align 4 +L(StrncpyExit15Bytes): + cmp $9, %ebx + je L(ExitTail9) + cmpb $0, 8(%ecx) + jz L(ExitTail9) + cmp $10, %ebx + je L(ExitTail10) + cmpb $0, 9(%ecx) + jz L(ExitTail10) + cmp $11, %ebx + je L(ExitTail11) + cmpb $0, 10(%ecx) + jz L(ExitTail11) + cmp $12, %ebx + je L(ExitTail12) + cmpb $0, 11(%ecx) + jz L(ExitTail12) + cmp $13, %ebx + je L(ExitTail13) + cmpb $0, 12(%ecx) + jz L(ExitTail13) + cmp $14, %ebx + je L(ExitTail14) + cmpb $0, 13(%ecx) + jz L(ExitTail14) +# endif + + .p2align 4 +L(ExitTail15): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 7(%ecx), %xmm0 + movlpd %xmm0, 7(%edx) +# ifdef USE_AS_STPCPY + lea 14(%edx), %eax +# else + movl %edx, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $15, %ebx + lea 15(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + +# ifdef USE_AS_STRNCPY + .p2align 4 +L(StrncpyExit8Bytes): + cmp $1, %ebx + je L(ExitTail1) + cmpb $0, (%ecx) + jz L(ExitTail1) + cmp $2, %ebx + je L(ExitTail2) + cmpb $0, 1(%ecx) + jz L(ExitTail2) + cmp $3, %ebx + je L(ExitTail3) + cmpb $0, 2(%ecx) + jz L(ExitTail3) + cmp $4, %ebx + je L(ExitTail4) + cmpb $0, 3(%ecx) + jz L(ExitTail4) + cmp $5, %ebx + je L(ExitTail5) + cmpb $0, 4(%ecx) + jz L(ExitTail5) + cmp $6, %ebx + je L(ExitTail6) + cmpb $0, 5(%ecx) + jz L(ExitTail6) + cmp 
$7, %ebx + je L(ExitTail7) + cmpb $0, 6(%ecx) + jz L(ExitTail7) +# endif + .p2align 4 +L(ExitTail8): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) +# ifdef USE_AS_STPCPY + lea 7(%edx), %eax +# else + movl %edx, %eax +# endif +# ifdef USE_AS_STRNCPY + sub $8, %ebx + lea 8(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + +END (STRCPY) + +#endif diff --git a/sysdeps/i386/i686/multiarch/strcpy.S b/sysdeps/i386/i686/multiarch/strcpy.S new file mode 100644 index 0000000..d025a4f --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strcpy.S @@ -0,0 +1,154 @@ +/* Multiple versions of strcpy + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <init-arch.h> + +#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY) +# ifndef STRCPY +# define STRCPY strcpy +# endif +#endif + +#ifdef USE_AS_STPCPY +# ifdef USE_AS_STRNCPY +# define STRCPY_SSSE3 __stpncpy_ssse3 +# define STRCPY_SSE2 __stpncpy_sse2 +# define STRCPY_IA32 __stpncpy_ia32 +# define __GI_STRCPY __GI_stpncpy +# define __GI___STRCPY __GI___stpncpy +# else +# define STRCPY_SSSE3 __stpcpy_ssse3 +# define STRCPY_SSE2 __stpcpy_sse2 +# define STRCPY_IA32 __stpcpy_ia32 +# define __GI_STRCPY __GI_stpcpy +# define __GI___STRCPY __GI___stpcpy +# endif +#else +# ifdef USE_AS_STRNCPY +# define STRCPY_SSSE3 __strncpy_ssse3 +# define STRCPY_SSE2 __strncpy_sse2 +# define STRCPY_IA32 __strncpy_ia32 +# define __GI_STRCPY __GI_strncpy +# else +# define STRCPY_SSSE3 __strcpy_ssse3 +# define STRCPY_SSE2 __strcpy_sse2 +# define STRCPY_IA32 __strcpy_ia32 +# define __GI_STRCPY __GI_strcpy +# endif +#endif + + +/* Define multiple versions only for the definition in libc. Don't + define multiple versions for strncpy in the static library since we + need strncpy before the initialization has happened.
*/ +#ifndef NOT_IN_libc + +# ifdef SHARED + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + .p2align 4 + .type __i686.get_pc_thunk.bx,@function +__i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret + + .text +ENTRY(STRCPY) + .type STRCPY, @gnu_indirect_function + pushl %ebx + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) + call __i686.get_pc_thunk.bx + addl $_GLOBAL_OFFSET_TABLE_, %ebx + cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) + jne 1f + call __init_cpu_features +1: leal STRCPY_IA32@GOTOFF(%ebx), %eax + testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx) + jz 2f + leal STRCPY_SSE2@GOTOFF(%ebx), %eax + testl $bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx) + jnz 2f + testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx) + jz 2f + leal STRCPY_SSSE3@GOTOFF(%ebx), %eax +2: popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + ret +END(STRCPY) +# else + +ENTRY(STRCPY) + .type STRCPY, @gnu_indirect_function + cmpl $0, KIND_OFFSET+__cpu_features + jne 1f + call __init_cpu_features +1: leal STRCPY_IA32, %eax + testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features + jz 2f + leal STRCPY_SSE2, %eax + testl $bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features + jnz 2f + testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features + jz 2f + leal STRCPY_SSSE3, %eax +2: ret +END(STRCPY) + +# endif + +# undef ENTRY +# define ENTRY(name) \ + .type STRCPY_IA32, @function; \ + .align 16; \ + STRCPY_IA32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCPY_IA32, .-STRCPY_IA32 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strcpy calls through a PLT. + The speedup we get from using SSSE3 instruction is likely eaten away + by the indirect call in the PLT. 
*/ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCPY; __GI_STRCPY = STRCPY_IA32 +# undef libc_hidden_def +# define libc_hidden_def(name) \ + .globl __GI___STRCPY; __GI___STRCPY = STRCPY_IA32 + +# endif +#endif + +#ifdef USE_AS_STPCPY +# ifdef USE_AS_STRNCPY +# include "../../stpncpy.S" +# else +# include "../../i586/stpcpy.S" +# endif +#else +# ifndef USE_AS_STRNCPY +# include "../../i586/strcpy.S" +# endif +#endif diff --git a/sysdeps/i386/i686/multiarch/strncpy-c.c b/sysdeps/i386/i686/multiarch/strncpy-c.c new file mode 100644 index 0000000..201e3f9 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strncpy-c.c @@ -0,0 +1,8 @@ +#define STRNCPY __strncpy_ia32 +#ifdef SHARED +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) \ + __hidden_ver1 (__strncpy_ia32, __GI_strncpy, __strncpy_ia32); +#endif + +#include "string/strncpy.c" diff --git a/sysdeps/i386/i686/multiarch/strncpy-sse2.S b/sysdeps/i386/i686/multiarch/strncpy-sse2.S new file mode 100644 index 0000000..bdd9923 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strncpy-sse2.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCPY +#define STRCPY __strncpy_sse2 +#include "strcpy-sse2.S" diff --git a/sysdeps/i386/i686/multiarch/strncpy-ssse3.S b/sysdeps/i386/i686/multiarch/strncpy-ssse3.S new file mode 100644 index 0000000..bf82ee4 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strncpy-ssse3.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCPY +#define STRCPY __strncpy_ssse3 +#include "strcpy-ssse3.S" diff --git a/sysdeps/i386/i686/multiarch/strncpy.S b/sysdeps/i386/i686/multiarch/strncpy.S new file mode 100644 index 0000000..30a5bd2 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strncpy.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCPY +#define STRCPY strncpy +#include "strcpy.S" diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c index 809d105..81b2378 100644 --- a/sysdeps/x86_64/multiarch/init-arch.c +++ b/sysdeps/x86_64/multiarch/init-arch.c @@ -97,13 +97,18 @@ __init_cpu_features (void) case 0x2c: case 0x2e: case 0x2f: - /* Rep string instructions and copy backward are fast on - Intel Core i3, i5 and i7. */ + /* Rep string instructions, copy backward and unaligned loads + are fast on Intel Core i3, i5 and i7. 
*/ #if index_Fast_Rep_String != index_Fast_Copy_Backward # error index_Fast_Rep_String != index_Fast_Copy_Backward #endif +#if index_Fast_Rep_String != index_Fast_Unaligned_Load +# error index_Fast_Rep_String != index_Fast_Unaligned_Load +#endif __cpu_features.feature[index_Fast_Rep_String] - |= bit_Fast_Rep_String | bit_Fast_Copy_Backward; + |= (bit_Fast_Rep_String + | bit_Fast_Copy_Backward + | bit_Fast_Unaligned_Load); break; } } diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h index 6e409b8..addf5f3 100644 --- a/sysdeps/x86_64/multiarch/init-arch.h +++ b/sysdeps/x86_64/multiarch/init-arch.h @@ -20,6 +20,7 @@ #define bit_Fast_Copy_Backward (1 << 1) #define bit_Slow_BSF (1 << 2) #define bit_Prefer_SSE_for_memop (1 << 3) +#define bit_Fast_Unaligned_Load (1 << 4) #ifdef __ASSEMBLER__ @@ -39,6 +40,7 @@ # define index_Fast_Copy_Backward FEATURE_INDEX_1*FEATURE_SIZE # define index_Slow_BSF FEATURE_INDEX_1*FEATURE_SIZE # define index_Prefer_SSE_for_memop FEATURE_INDEX_1*FEATURE_SIZE +# define index_Fast_Unaligned_Load FEATURE_INDEX_1*FEATURE_SIZE #else /* __ASSEMBLER__ */ @@ -112,6 +114,7 @@ extern const struct cpu_features *__get_cpu_features (void) # define index_Fast_Copy_Backward FEATURE_INDEX_1 # define index_Slow_BSF FEATURE_INDEX_1 # define index_Prefer_SSE_for_memop FEATURE_INDEX_1 +# define index_Fast_Unaligned_Load FEATURE_INDEX_1 #define HAS_ARCH_FEATURE(idx, bit) \ ((__get_cpu_features ()->feature[idx] & (bit)) != 0) @@ -128,4 +131,7 @@ extern const struct cpu_features *__get_cpu_features (void) #define HAS_PREFER_SSE_FOR_MEMOP \ HAS_ARCH_FEATURE (index_Prefer_SSE_for_memop, bit_Prefer_SSE_for_memop) +#define HAS_FAST_UNALIGNED_LOAD \ + HAS_ARCH_FEATURE (index_Fast_Unaligned_Load, bit_Fast_Unaligned_Load) + #endif /* __ASSEMBLER__ */
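The IFUNC resolver added in sysdeps/i386/i686/multiarch/strcpy.S picks one of three implementations at run time: it starts from STRCPY_IA32, moves to STRCPY_SSE2 once the SSE2 CPUID bit is set, keeps the SSE2 unaligned-load version when Fast_Unaligned_Load is set, and otherwise falls back to STRCPY_SSSE3 if SSSE3 is available. Below is a minimal stand-alone C sketch of that fall-through order; it is not part of the patch. __builtin_cpu_supports replaces the resolver's CPUID-derived bit tests, and has_fast_unaligned_load() together with the strcpy_* placeholders are hypothetical stand-ins for the glibc-internal __cpu_features feature word (queried via HAS_FAST_UNALIGNED_LOAD above) and the __strcpy_{ia32,sse2,ssse3} entry points.

/* Minimal sketch of the strcpy IFUNC selection order; placeholder names,
   not the glibc-internal symbols.  */
#include <stdio.h>
#include <string.h>

typedef char *(*strcpy_fn) (char *, const char *);

/* Placeholders standing in for __strcpy_ia32, __strcpy_sse2 and
   __strcpy_ssse3 from the patch.  */
static char *strcpy_ia32  (char *d, const char *s) { return strcpy (d, s); }
static char *strcpy_sse2  (char *d, const char *s) { return strcpy (d, s); }
static char *strcpy_ssse3 (char *d, const char *s) { return strcpy (d, s); }

/* Hypothetical stand-in: the real test reads bit_Fast_Unaligned_Load from
   __cpu_features.feature[FEATURE_INDEX_1], which init-arch.c now sets for
   the listed Intel Core i3/i5/i7 models.  */
static int has_fast_unaligned_load (void) { return 0; }

static strcpy_fn
select_strcpy (void)
{
  strcpy_fn fn = strcpy_ia32;            /* leal STRCPY_IA32, %eax            */

  if (!__builtin_cpu_supports ("sse2"))  /* testl $bit_SSE2, ...              */
    return fn;
  fn = strcpy_sse2;                      /* leal STRCPY_SSE2, %eax            */

  if (has_fast_unaligned_load ())        /* testl $bit_Fast_Unaligned_Load    */
    return fn;                           /* keep the SSE2 unaligned-load copy */

  if (__builtin_cpu_supports ("ssse3"))  /* testl $bit_SSSE3, ...             */
    fn = strcpy_ssse3;                   /* leal STRCPY_SSSE3, %eax           */

  return fn;
}

int
main (void)
{
  char buf[64];
  strcpy_fn fn = select_strcpy ();
  puts (fn (buf, "selected strcpy variant"));
  return 0;
}

On the Core i3/i5/i7 models that init-arch.c now tags with Fast_Unaligned_Load, the resolver never reaches the SSSE3 test, so the SSE2 unaligned-load copy is used instead of the palignr-based SSSE3 one.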