.section .text.ssse3,"ax",@progbits
ENTRY (STRCPY)
+
mov %rsi, %rcx
# ifdef USE_AS_STRNCPY
mov %rdx, %r8
jz L(Exit0)
cmp $8, %r8
jbe L(StrncpyExit8Bytes)
-# endif
+# endif
cmpb $0, (%rcx)
jz L(Exit1)
cmpb $0, 1(%rcx)
jz L(Exit7)
cmpb $0, 7(%rcx)
jz L(Exit8)
-# ifdef USE_AS_STRNCPY
+# ifdef USE_AS_STRNCPY
cmp $16, %r8
jb L(StrncpyExit15Bytes)
-# endif
+# endif
cmpb $0, 8(%rcx)
jz L(Exit9)
cmpb $0, 9(%rcx)
jz L(Exit14)
cmpb $0, 14(%rcx)
jz L(Exit15)
-# ifdef USE_AS_STRNCPY
+# ifdef USE_AS_STRNCPY
cmp $16, %r8
je L(Exit16)
-# endif
+# endif
cmpb $0, 15(%rcx)
jz L(Exit16)
# endif
sub $16, %r8
and $0xf, %rsi
-/* add 16 bytes rcx_shift to r8 */
+/* add 16 bytes rcx_offset to r8 */
+
add %rsi, %r8
# endif
lea 16(%rcx), %rsi
-/* Now:
- rsi = alignment_16(rcx) + rcx_shift + 16;
- rcx_shift = rcx - alignment_16(rcx)
-*/
and $-16, %rsi
-/* Now:
- rsi = alignment_16(rcx) + 16
-*/
pxor %xmm0, %xmm0
mov (%rcx), %r9
mov %r9, (%rdx)
-/*
- look if there is zero symbol in next 16 bytes of string
- from rsi to rsi + 15 and form mask in xmm0
-*/
pcmpeqb (%rsi), %xmm0
mov 8(%rcx), %r9
mov %r9, 8(%rdx)
pmovmskb %xmm0, %rax
sub %rcx, %rsi
-/* rsi = 16 - rcx_shift */
-
-/* rax = 0: there isn't end of string from position rsi to rsi+15 */
-
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(CopyFrom1To16BytesCase2OrCase3)
mov %rdx, %rax
lea 16(%rdx), %rdx
-/* Now:
- rdx = rdx + 16 = alignment_16(rdx) + rdx_shift + 16
-*/
and $-16, %rdx
-
-/* Now: rdx = alignment_16(rdx) + 16 */
-
sub %rdx, %rax
-/* Now: rax = rdx_shift - 16 */
-
# ifdef USE_AS_STRNCPY
add %rax, %rsi
lea -1(%rsi), %rsi
L(ContinueCopy):
# endif
sub %rax, %rcx
-/* Now:
- case rcx_shift >= rdx_shift:
- rcx = alignment_16(rcx) + (rcx_shift - rdx_shift) + 16
- case rcx_shift < rdx_shift:
- rcx = alignment_16(rcx) + (16 + rcx_shift - rdx_shift)
-*/
mov %rcx, %rax
and $0xf, %rax
-/* Now:
- case rcx_shift >= rdx_shift: rax = rcx_shift - rdx_shift
- case rcx_shift < rdx_shift: rax = (16 + rcx_shift - rdx_shift)
- rax can be 0, 1, ..., 15
-*/
mov $0, %rsi
-/* case: rcx_shift == rdx_shift */
+/* case: rcx_offset == rdx_offset */
jz L(Align16Both)
sub %rcx, %rax
sub %rax, %rdx
# ifdef USE_AS_STRNCPY
- lea 48+64(%r8, %rax), %r8
+ lea 112(%r8, %rax), %r8
# endif
mov $-0x40, %rsi
+ .p2align 4
L(Aligned64Loop):
movaps (%rcx), %xmm2
movaps %xmm2, %xmm4
jnz L(Shl1LoopExit)
palignr $1, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 31(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit1Case2OrCase3)
test %rax, %rax
jnz L(Shl1LoopExit)
- palignr $1, %xmm1, %xmm2
+ palignr $1, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 31(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
jnz L(Shl1LoopExit)
palignr $1, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 31(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit1Case2OrCase3)
test %rax, %rax
jnz L(Shl1LoopExit)
- palignr $1, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $1, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 31(%rcx), %rcx
lea 16(%rdx), %rdx
# endif
movaps -1(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl1LoopStart):
movaps 15(%rcx), %xmm2
movaps 31(%rcx), %xmm3
jmp L(Shl1LoopStart)
L(Shl1LoopExit):
- movaps (%rdx), %xmm6
- psrldq $15, %xmm6
+ movdqu -1(%rcx), %xmm1
mov $15, %rsi
- palignr $1, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ movdqu %xmm1, -1(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
jnz L(Shl2LoopExit)
palignr $2, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 30(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit2Case2OrCase3)
test %rax, %rax
jnz L(Shl2LoopExit)
- palignr $2, %xmm1, %xmm2
+ palignr $2, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 30(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
jnz L(Shl2LoopExit)
palignr $2, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 30(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit2Case2OrCase3)
test %rax, %rax
jnz L(Shl2LoopExit)
- palignr $2, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $2, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 30(%rcx), %rcx
lea 16(%rdx), %rdx
# endif
movaps -2(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl2LoopStart):
movaps 14(%rcx), %xmm2
movaps 30(%rcx), %xmm3
jmp L(Shl2LoopStart)
L(Shl2LoopExit):
- movaps (%rdx), %xmm6
- psrldq $14, %xmm6
+ movdqu -2(%rcx), %xmm1
mov $14, %rsi
- palignr $2, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ movdqu %xmm1, -2(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
jnz L(Shl3LoopExit)
palignr $3, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 29(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit3Case2OrCase3)
test %rax, %rax
jnz L(Shl3LoopExit)
- palignr $3, %xmm1, %xmm2
+ palignr $3, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 29(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
jnz L(Shl3LoopExit)
palignr $3, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 29(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit3Case2OrCase3)
test %rax, %rax
jnz L(Shl3LoopExit)
- palignr $3, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $3, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 29(%rcx), %rcx
lea 16(%rdx), %rdx
# endif
movaps -3(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl3LoopStart):
movaps 13(%rcx), %xmm2
movaps 29(%rcx), %xmm3
jmp L(Shl3LoopStart)
L(Shl3LoopExit):
- movaps (%rdx), %xmm6
- psrldq $13, %xmm6
+ movdqu -3(%rcx), %xmm1
mov $13, %rsi
- palignr $3, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ movdqu %xmm1, -3(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
jnz L(Shl4LoopExit)
palignr $4, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 28(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit4Case2OrCase3)
test %rax, %rax
jnz L(Shl4LoopExit)
- palignr $4, %xmm1, %xmm2
+ palignr $4, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 28(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
jnz L(Shl4LoopExit)
palignr $4, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 28(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit4Case2OrCase3)
test %rax, %rax
jnz L(Shl4LoopExit)
- palignr $4, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $4, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 28(%rcx), %rcx
lea 16(%rdx), %rdx
# endif
movaps -4(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl4LoopStart):
movaps 12(%rcx), %xmm2
movaps 28(%rcx), %xmm3
jmp L(Shl4LoopStart)
L(Shl4LoopExit):
- movaps (%rdx), %xmm6
- psrldq $12, %xmm6
+ movdqu -4(%rcx), %xmm1
mov $12, %rsi
- palignr $4, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ movdqu %xmm1, -4(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
jnz L(Shl5LoopExit)
palignr $5, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 27(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit5Case2OrCase3)
test %rax, %rax
jnz L(Shl5LoopExit)
- palignr $5, %xmm1, %xmm2
+ palignr $5, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 27(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
jnz L(Shl5LoopExit)
palignr $5, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 27(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit5Case2OrCase3)
test %rax, %rax
jnz L(Shl5LoopExit)
- palignr $5, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $5, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 27(%rcx), %rcx
lea 16(%rdx), %rdx
# endif
movaps -5(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl5LoopStart):
movaps 11(%rcx), %xmm2
movaps 27(%rcx), %xmm3
jmp L(Shl5LoopStart)
L(Shl5LoopExit):
- movaps (%rdx), %xmm6
- psrldq $11, %xmm6
+ movdqu -5(%rcx), %xmm1
mov $11, %rsi
- palignr $5, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ movdqu %xmm1, -5(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
jnz L(Shl6LoopExit)
palignr $6, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 26(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit6Case2OrCase3)
test %rax, %rax
jnz L(Shl6LoopExit)
- palignr $6, %xmm1, %xmm2
+ palignr $6, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 26(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
jnz L(Shl6LoopExit)
palignr $6, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 26(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit6Case2OrCase3)
test %rax, %rax
jnz L(Shl6LoopExit)
- palignr $6, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $6, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 26(%rcx), %rcx
lea 16(%rdx), %rdx
# endif
movaps -6(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl6LoopStart):
movaps 10(%rcx), %xmm2
movaps 26(%rcx), %xmm3
jmp L(Shl6LoopStart)
L(Shl6LoopExit):
- movaps (%rdx), %xmm6
- psrldq $10, %xmm6
+ mov (%rcx), %r9
+ mov 6(%rcx), %esi
+ mov %r9, (%rdx)
+ mov %esi, 6(%rdx)
mov $10, %rsi
- palignr $6, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
jnz L(Shl7LoopExit)
palignr $7, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 25(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit7Case2OrCase3)
test %rax, %rax
jnz L(Shl7LoopExit)
- palignr $7, %xmm1, %xmm2
+ palignr $7, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 25(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
jnz L(Shl7LoopExit)
palignr $7, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 25(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit7Case2OrCase3)
test %rax, %rax
jnz L(Shl7LoopExit)
- palignr $7, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $7, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 25(%rcx), %rcx
lea 16(%rdx), %rdx
# endif
movaps -7(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl7LoopStart):
movaps 9(%rcx), %xmm2
movaps 25(%rcx), %xmm3
jmp L(Shl7LoopStart)
L(Shl7LoopExit):
- movaps (%rdx), %xmm6
- psrldq $9, %xmm6
+ mov (%rcx), %r9
+ mov 5(%rcx), %esi
+ mov %r9, (%rdx)
+ mov %esi, 5(%rdx)
mov $9, %rsi
- palignr $7, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
jnz L(Shl8LoopExit)
palignr $8, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 24(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit8Case2OrCase3)
test %rax, %rax
jnz L(Shl8LoopExit)
- palignr $8, %xmm1, %xmm2
+ palignr $8, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 24(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
jnz L(Shl8LoopExit)
palignr $8, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 24(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit8Case2OrCase3)
test %rax, %rax
jnz L(Shl8LoopExit)
- palignr $8, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $8, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 24(%rcx), %rcx
lea 16(%rdx), %rdx
# endif
movaps -8(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl8LoopStart):
movaps 8(%rcx), %xmm2
movaps 24(%rcx), %xmm3
jmp L(Shl8LoopStart)
L(Shl8LoopExit):
- movaps (%rdx), %xmm6
- psrldq $8, %xmm6
+ mov (%rcx), %r9
mov $8, %rsi
- palignr $8, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9, (%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
jnz L(Shl9LoopExit)
palignr $9, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 23(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit9Case2OrCase3)
test %rax, %rax
jnz L(Shl9LoopExit)
- palignr $9, %xmm1, %xmm2
+ palignr $9, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 23(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
jnz L(Shl9LoopExit)
palignr $9, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 23(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit9Case2OrCase3)
test %rax, %rax
jnz L(Shl9LoopExit)
- palignr $9, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $9, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 23(%rcx), %rcx
lea 16(%rdx), %rdx
# endif
movaps -9(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl9LoopStart):
movaps 7(%rcx), %xmm2
movaps 23(%rcx), %xmm3
jmp L(Shl9LoopStart)
L(Shl9LoopExit):
- movaps (%rdx), %xmm6
- psrldq $7, %xmm6
+ mov -1(%rcx), %r9
mov $7, %rsi
- palignr $9, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9, -1(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
jnz L(Shl10LoopExit)
palignr $10, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 22(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit10Case2OrCase3)
test %rax, %rax
jnz L(Shl10LoopExit)
- palignr $10, %xmm1, %xmm2
+ palignr $10, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 22(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
jnz L(Shl10LoopExit)
palignr $10, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 22(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit10Case2OrCase3)
test %rax, %rax
jnz L(Shl10LoopExit)
- palignr $10, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $10, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 22(%rcx), %rcx
lea 16(%rdx), %rdx
# endif
movaps -10(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl10LoopStart):
movaps 6(%rcx), %xmm2
movaps 22(%rcx), %xmm3
jmp L(Shl10LoopStart)
L(Shl10LoopExit):
- movaps (%rdx), %xmm6
- psrldq $6, %xmm6
+ mov -2(%rcx), %r9
mov $6, %rsi
- palignr $10, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9, -2(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
jnz L(Shl11LoopExit)
palignr $11, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 21(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit11Case2OrCase3)
test %rax, %rax
jnz L(Shl11LoopExit)
- palignr $11, %xmm1, %xmm2
+ palignr $11, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 21(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
jnz L(Shl11LoopExit)
palignr $11, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 21(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit11Case2OrCase3)
test %rax, %rax
jnz L(Shl11LoopExit)
- palignr $11, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $11, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 21(%rcx), %rcx
lea 16(%rdx), %rdx
# endif
movaps -11(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl11LoopStart):
movaps 5(%rcx), %xmm2
movaps 21(%rcx), %xmm3
jmp L(Shl11LoopStart)
L(Shl11LoopExit):
- movaps (%rdx), %xmm6
- psrldq $5, %xmm6
+ mov -3(%rcx), %r9
mov $5, %rsi
- palignr $11, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9, -3(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
jnz L(Shl12LoopExit)
palignr $12, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 20(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit12Case2OrCase3)
test %rax, %rax
jnz L(Shl12LoopExit)
- palignr $12, %xmm1, %xmm2
+ palignr $12, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 20(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
jnz L(Shl12LoopExit)
palignr $12, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 20(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit12Case2OrCase3)
test %rax, %rax
jnz L(Shl12LoopExit)
- palignr $12, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $12, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 20(%rcx), %rcx
lea 16(%rdx), %rdx
# endif
movaps -12(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl12LoopStart):
movaps 4(%rcx), %xmm2
movaps 20(%rcx), %xmm3
jmp L(Shl12LoopStart)
L(Shl12LoopExit):
- movaps (%rdx), %xmm6
- psrldq $4, %xmm6
+ mov (%rcx), %r9d
mov $4, %rsi
- palignr $12, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9d, (%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
jnz L(Shl13LoopExit)
palignr $13, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 19(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit13Case2OrCase3)
test %rax, %rax
jnz L(Shl13LoopExit)
- palignr $13, %xmm1, %xmm2
+ palignr $13, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 19(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
jnz L(Shl13LoopExit)
palignr $13, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 19(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit13Case2OrCase3)
test %rax, %rax
jnz L(Shl13LoopExit)
- palignr $13, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $13, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 19(%rcx), %rcx
lea 16(%rdx), %rdx
# endif
movaps -13(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl13LoopStart):
movaps 3(%rcx), %xmm2
movaps 19(%rcx), %xmm3
jmp L(Shl13LoopStart)
L(Shl13LoopExit):
- movaps (%rdx), %xmm6
- psrldq $3, %xmm6
+ mov -1(%rcx), %r9d
mov $3, %rsi
- palignr $13, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9d, -1(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
jnz L(Shl14LoopExit)
palignr $14, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 18(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit14Case2OrCase3)
test %rax, %rax
jnz L(Shl14LoopExit)
- palignr $14, %xmm1, %xmm2
+ palignr $14, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 18(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
jnz L(Shl14LoopExit)
palignr $14, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 18(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit14Case2OrCase3)
test %rax, %rax
jnz L(Shl14LoopExit)
- palignr $14, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $14, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 18(%rcx), %rcx
lea 16(%rdx), %rdx
# endif
movaps -14(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl14LoopStart):
movaps 2(%rcx), %xmm2
movaps 18(%rcx), %xmm3
jmp L(Shl14LoopStart)
L(Shl14LoopExit):
- movaps (%rdx), %xmm6
- psrldq $2, %xmm6
+ mov -2(%rcx), %r9d
mov $2, %rsi
- palignr $14, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9d, -2(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
jnz L(Shl15LoopExit)
palignr $15, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 17(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit15Case2OrCase3)
test %rax, %rax
jnz L(Shl15LoopExit)
- palignr $15, %xmm1, %xmm2
+ palignr $15, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 17(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
jnz L(Shl15LoopExit)
palignr $15, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 17(%rcx), %xmm2
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit15Case2OrCase3)
test %rax, %rax
jnz L(Shl15LoopExit)
- palignr $15, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $15, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 17(%rcx), %rcx
lea 16(%rdx), %rdx
# endif
movaps -15(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl15LoopStart):
movaps 1(%rcx), %xmm2
movaps 17(%rcx), %xmm3
jmp L(Shl15LoopStart)
L(Shl15LoopExit):
- movaps (%rdx), %xmm6
- psrldq $1, %xmm6
+ mov -3(%rcx), %r9d
mov $1, %rsi
- palignr $15, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9d, -3(%rdx)
# ifdef USE_AS_STRCAT
jmp L(CopyFrom1To16Bytes)
# endif
# ifndef USE_AS_STRCAT
+
.p2align 4
L(CopyFrom1To16Bytes):
# ifdef USE_AS_STRNCPY
# ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
-# endif
+# endif
# endif
ret
# ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
-# endif
+# endif
# endif
ret
# ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
-# endif
+# endif
# endif
ret
# ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
-# endif
+# endif
# endif
ret
ret
# endif
-
# endif
# ifdef USE_AS_STRNCPY
-
+ .p2align 4
L(StrncpyLeaveCase2OrCase3):
test %rax, %rax
jnz L(Aligned64LeaveCase2)
lea -16(%r8), %r8
jmp L(CopyFrom1To16BytesCase2)
/*--------------------------------------------------*/
+ .p2align 4
L(StrncpyExit1Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $15, %xmm6
+ movdqu -1(%rcx), %xmm0
+ movdqu %xmm0, -1(%rdx)
mov $15, %rsi
- palignr $1, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit2Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $14, %xmm6
+ movdqu -2(%rcx), %xmm0
+ movdqu %xmm0, -2(%rdx)
mov $14, %rsi
- palignr $2, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit3Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $13, %xmm6
+ movdqu -3(%rcx), %xmm0
+ movdqu %xmm0, -3(%rdx)
mov $13, %rsi
- palignr $3, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit4Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $12, %xmm6
+ movdqu -4(%rcx), %xmm0
+ movdqu %xmm0, -4(%rdx)
mov $12, %rsi
- palignr $4, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit5Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $11, %xmm6
+ movdqu -5(%rcx), %xmm0
+ movdqu %xmm0, -5(%rdx)
mov $11, %rsi
- palignr $5, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit6Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $10, %xmm6
- mov $10, %rsi
- palignr $6, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov (%rcx), %rsi
+ mov 6(%rcx), %r9d
+ mov %r9d, 6(%rdx)
+ mov %rsi, (%rdx)
test %rax, %rax
+ mov $10, %rsi
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit7Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $9, %xmm6
- mov $9, %rsi
- palignr $7, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov (%rcx), %rsi
+ mov 5(%rcx), %r9d
+ mov %r9d, 5(%rdx)
+ mov %rsi, (%rdx)
test %rax, %rax
+ mov $9, %rsi
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit8Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $8, %xmm6
+ mov (%rcx), %r9
mov $8, %rsi
- palignr $8, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9, (%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit9Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $7, %xmm6
+ mov -1(%rcx), %r9
mov $7, %rsi
- palignr $9, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9, -1(%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit10Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $6, %xmm6
+ mov -2(%rcx), %r9
mov $6, %rsi
- palignr $10, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9, -2(%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit11Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $5, %xmm6
+ mov -3(%rcx), %r9
mov $5, %rsi
- palignr $11, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9, -3(%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit12Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $4, %xmm6
+ mov (%rcx), %r9d
mov $4, %rsi
- palignr $12, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9d, (%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit13Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $3, %xmm6
+ mov -1(%rcx), %r9d
mov $3, %rsi
- palignr $13, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9d, -1(%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit14Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $2, %xmm6
+ mov -2(%rcx), %r9d
mov $2, %rsi
- palignr $14, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9d, -2(%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit15Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $1, %xmm6
+ mov -3(%rcx), %r9d
mov $1, %rsi
- palignr $15, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9d, -3(%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave1):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit1)
palignr $1, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 31(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit1)
- palignr $1, %xmm1, %xmm2
+ palignr $1, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 31+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit1)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit1)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit1):
- movaps (%rdx, %rsi), %xmm6
- psrldq $15, %xmm6
- palignr $1, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 15(%rsi), %rsi
+ lea 15(%rdx, %rsi), %rdx
+ lea 15(%rcx, %rsi), %rcx
+ mov -15(%rcx), %rsi
+ mov -8(%rcx), %rax
+ mov %rsi, -15(%rdx)
+ mov %rax, -8(%rdx)
+ xor %rsi, %rsi
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave2):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit2)
palignr $2, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 30(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit2)
- palignr $2, %xmm1, %xmm2
+ palignr $2, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 30+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit2)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit2)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit2):
- movaps (%rdx, %rsi), %xmm6
- psrldq $14, %xmm6
- palignr $2, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 14(%rsi), %rsi
+ lea 14(%rdx, %rsi), %rdx
+ lea 14(%rcx, %rsi), %rcx
+ mov -14(%rcx), %rsi
+ mov -8(%rcx), %rax
+ mov %rsi, -14(%rdx)
+ mov %rax, -8(%rdx)
+ xor %rsi, %rsi
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave3):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit3)
palignr $3, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 29(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit3)
- palignr $3, %xmm1, %xmm2
+ palignr $3, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 29+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit3)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit3)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit3):
- movaps (%rdx, %rsi), %xmm6
- psrldq $13, %xmm6
- palignr $3, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 13(%rsi), %rsi
+ lea 13(%rdx, %rsi), %rdx
+ lea 13(%rcx, %rsi), %rcx
+ mov -13(%rcx), %rsi
+ mov -8(%rcx), %rax
+ mov %rsi, -13(%rdx)
+ mov %rax, -8(%rdx)
+ xor %rsi, %rsi
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave4):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit4)
palignr $4, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 28(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit4)
- palignr $4, %xmm1, %xmm2
+ palignr $4, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 28+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit4)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit4)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit4):
- movaps (%rdx, %rsi), %xmm6
- psrldq $12, %xmm6
- palignr $4, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 12(%rsi), %rsi
+ lea 12(%rdx, %rsi), %rdx
+ lea 12(%rcx, %rsi), %rcx
+ mov -12(%rcx), %rsi
+ mov -4(%rcx), %eax
+ mov %rsi, -12(%rdx)
+ mov %eax, -4(%rdx)
+ xor %rsi, %rsi
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave5):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit5)
palignr $5, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 27(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit5)
- palignr $5, %xmm1, %xmm2
+ palignr $5, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 27+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit5)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit5)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit5):
- movaps (%rdx, %rsi), %xmm6
- psrldq $11, %xmm6
- palignr $5, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 11(%rsi), %rsi
+ lea 11(%rdx, %rsi), %rdx
+ lea 11(%rcx, %rsi), %rcx
+ mov -11(%rcx), %rsi
+ mov -4(%rcx), %eax
+ mov %rsi, -11(%rdx)
+ mov %eax, -4(%rdx)
+ xor %rsi, %rsi
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave6):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit6)
palignr $6, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 26(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit6)
- palignr $6, %xmm1, %xmm2
+ palignr $6, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 26+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit6)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit6)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit6):
- movaps (%rdx, %rsi), %xmm6
- psrldq $10, %xmm6
- palignr $6, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 10(%rsi), %rsi
+ lea 10(%rdx, %rsi), %rdx
+ lea 10(%rcx, %rsi), %rcx
+ mov -10(%rcx), %rsi
+ movw -2(%rcx), %ax
+ mov %rsi, -10(%rdx)
+ movw %ax, -2(%rdx)
+ xor %rsi, %rsi
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave7):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit7)
palignr $7, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 25(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit7)
- palignr $7, %xmm1, %xmm2
+ palignr $7, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 25+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit7)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit7)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit7):
- movaps (%rdx, %rsi), %xmm6
- psrldq $9, %xmm6
- palignr $7, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 9(%rsi), %rsi
+ lea 9(%rdx, %rsi), %rdx
+ lea 9(%rcx, %rsi), %rcx
+ mov -9(%rcx), %rsi
+ movb -1(%rcx), %ah
+ mov %rsi, -9(%rdx)
+ movb %ah, -1(%rdx)
+ xor %rsi, %rsi
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave8):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit8)
palignr $8, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 24(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit8)
- palignr $8, %xmm1, %xmm2
+ palignr $8, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 24+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit8)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit8)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit8):
- movaps (%rdx, %rsi), %xmm6
- psrldq $8, %xmm6
- palignr $8, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 8(%rsi), %rsi
+ lea 8(%rdx, %rsi), %rdx
+ lea 8(%rcx, %rsi), %rcx
+ mov -8(%rcx), %rax
+ xor %rsi, %rsi
+ mov %rax, -8(%rdx)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave9):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit9)
palignr $9, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 23(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit9)
- palignr $9, %xmm1, %xmm2
+ palignr $9, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 23+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit9)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit9)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit9):
- movaps (%rdx, %rsi), %xmm6
- psrldq $7, %xmm6
- palignr $9, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 7(%rsi), %rsi
+ lea 7(%rdx, %rsi), %rdx
+ lea 7(%rcx, %rsi), %rcx
+ mov -8(%rcx), %rax
+ xor %rsi, %rsi
+ mov %rax, -8(%rdx)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave10):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit10)
palignr $10, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 22(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit10)
- palignr $10, %xmm1, %xmm2
+ palignr $10, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 22+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit10)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit10)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit10):
- movaps (%rdx, %rsi), %xmm6
- psrldq $6, %xmm6
- palignr $10, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 6(%rsi), %rsi
+ lea 6(%rdx, %rsi), %rdx
+ lea 6(%rcx, %rsi), %rcx
+ mov -8(%rcx), %rax
+ xor %rsi, %rsi
+ mov %rax, -8(%rdx)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave11):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit11)
palignr $11, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 21(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit11)
- palignr $11, %xmm1, %xmm2
+ palignr $11, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 21+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit11)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit11)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit11):
- movaps (%rdx, %rsi), %xmm6
- psrldq $5, %xmm6
- palignr $11, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 5(%rsi), %rsi
+ lea 5(%rdx, %rsi), %rdx
+ lea 5(%rcx, %rsi), %rcx
+ mov -8(%rcx), %rax
+ xor %rsi, %rsi
+ mov %rax, -8(%rdx)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave12):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit12)
palignr $12, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 20(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit12)
- palignr $12, %xmm1, %xmm2
+ palignr $12, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 20+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit12)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit12)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit12):
- movaps (%rdx, %rsi), %xmm6
- psrldq $4, %xmm6
- palignr $12, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 4(%rsi), %rsi
+ lea 4(%rdx, %rsi), %rdx
+ lea 4(%rcx, %rsi), %rcx
+ mov -4(%rcx), %eax
+ xor %rsi, %rsi
+ mov %eax, -4(%rdx)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave13):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit13)
palignr $13, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 19(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit13)
- palignr $13, %xmm1, %xmm2
+ palignr $13, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 19+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit13)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit13)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit13):
- movaps (%rdx, %rsi), %xmm6
- psrldq $3, %xmm6
- palignr $13, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 3(%rsi), %rsi
+ lea 3(%rdx, %rsi), %rdx
+ lea 3(%rcx, %rsi), %rcx
+ mov -4(%rcx), %eax
+ xor %rsi, %rsi
+ mov %eax, -4(%rdx)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave14):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit14)
palignr $14, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 18(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit14)
- palignr $14, %xmm1, %xmm2
+ palignr $14, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 18+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit14)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit14)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit14):
- movaps (%rdx, %rsi), %xmm6
- psrldq $2, %xmm6
- palignr $14, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 2(%rsi), %rsi
+ lea 2(%rdx, %rsi), %rdx
+ lea 2(%rcx, %rsi), %rcx
+ movw -2(%rcx), %ax
+ xor %rsi, %rsi
+ movw %ax, -2(%rdx)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave15):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit15)
palignr $15, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 17(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit15)
- palignr $15, %xmm1, %xmm2
+ palignr $15, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 17+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit15)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit15)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit15):
- movaps (%rdx, %rsi), %xmm6
- psrldq $1, %xmm6
- palignr $15, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 1(%rsi), %rsi
+ lea 1(%rdx, %rsi), %rdx
+ lea 1(%rcx, %rsi), %rcx
+ movb -1(%rcx), %ah
+ xor %rsi, %rsi
+ movb %ah, -1(%rdx)
jmp L(CopyFrom1To16BytesCase3)
+
# endif
# ifndef USE_AS_STRCAT
END (STRCPY)