2 Copyright (C) 2009, 2010 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
22 #include <init-arch.h>
25 /* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
26 if the new counter > the old one or is 0. */
27 # define UPDATE_STRNCMP_COUNTER \
28 /* Calculate the number of bytes left to compare: %r9 = %r11 + %rcx - 16. */ \
29 lea -16(%rcx, %r11), %r9; \
31 jb LABEL(strcmp_exitz_sse4_2); /* presumably the "new counter > old one" exit; the compare that sets CF is not visible in this listing */ \
33 je LABEL(strcmp_exitz_sse4_2); /* presumably the "new counter is 0" exit; the instruction that sets ZF is not visible in this listing */ \
36 # define STRCMP_SSE42 __strncmp_sse42
37 # define STRCMP_SSSE3 __strncmp_ssse3
38 # define STRCMP_SSE2 __strncmp_sse2
39 # define __GI_STRCMP __GI_strncmp
40 #elif defined USE_AS_STRCASECMP_L
41 # include "locale-defines.h"
43 # define UPDATE_STRNCMP_COUNTER
45 # define STRCMP_SSE42 __strcasecmp_l_sse42
46 # define STRCMP_SSSE3 __strcasecmp_l_ssse3
47 # define STRCMP_SSE2 __strcasecmp_l_sse2
48 # define __GI_STRCMP __GI___strcasecmp_l
49 #elif defined USE_AS_STRNCASECMP_L
50 # include "locale-defines.h"
52 /* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
53 if the new counter > the old one or is 0. */
54 # define UPDATE_STRNCMP_COUNTER \
55 /* Calculate the number of bytes left to compare: %r9 = %r11 + %rcx - 16. */ \
56 lea -16(%rcx, %r11), %r9; \
58 jb LABEL(strcmp_exitz_sse4_2); /* presumably the "new counter > old one" exit; the compare that sets CF is not visible in this listing */ \
60 je LABEL(strcmp_exitz_sse4_2); /* presumably the "new counter is 0" exit; the instruction that sets ZF is not visible in this listing */ \
63 # define STRCMP_SSE42 __strncasecmp_l_sse42
64 # define STRCMP_SSSE3 __strncasecmp_l_ssse3
65 # define STRCMP_SSE2 __strncasecmp_l_sse2
66 # define __GI_STRCMP __GI___strncasecmp_l
68 # define UPDATE_STRNCMP_COUNTER
70 # define STRCMP strcmp
71 # define STRCMP_SSE42 __strcmp_sse42
72 # define STRCMP_SSSE3 __strcmp_ssse3
73 # define STRCMP_SSE2 __strcmp_sse2
74 # define __GI_STRCMP __GI_strcmp
79 # define LABEL(l) L(l)
82 /* Define multiple versions only for the definition in libc. Don't
83 define multiple versions for strncmp in static library since we
84 need strncmp before the initialization has happened. */
85 #if (defined SHARED || !defined USE_AS_STRNCMP) && !defined NOT_IN_libc
88 .type STRCMP, @gnu_indirect_function /* STRCMP is a GNU indirect function; the code below loads candidate implementation addresses into %rax */
89 cmpl $0, __cpu_features+KIND_OFFSET(%rip) /* compare the kind field of __cpu_features with 0 */
91 call __init_cpu_features /* fill in __cpu_features (the branch that skips this call is not visible in this listing) */
93 leaq STRCMP_SSE42(%rip), %rax /* candidate: the SSE4.2 implementation */
94 testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) /* check the SSE4.2 feature bit */
96 leaq STRCMP_SSSE3(%rip), %rax /* candidate: the SSSE3 implementation */
97 testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) /* check the SSSE3 feature bit */
99 leaq STRCMP_SSE2(%rip), %rax /* candidate: the SSE2 implementation */
103 # ifdef USE_AS_STRCASECMP_L
105 .type __strcasecmp, @gnu_indirect_function /* __strcasecmp is a GNU indirect function; the code below loads candidate implementation addresses into %rax */
106 cmpl $0, __cpu_features+KIND_OFFSET(%rip) /* compare the kind field of __cpu_features with 0 */
108 call __init_cpu_features /* fill in __cpu_features (the branch that skips this call is not visible in this listing) */
110 leaq __strcasecmp_sse42(%rip), %rax /* candidate: the SSE4.2 implementation */
111 testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) /* check the SSE4.2 feature bit */
113 leaq __strcasecmp_ssse3(%rip), %rax /* candidate: the SSSE3 implementation */
114 testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) /* check the SSSE3 feature bit */
116 leaq __strcasecmp_sse2(%rip), %rax /* candidate: the SSE2 implementation */
119 weak_alias (__strcasecmp, strcasecmp) /* export strcasecmp as a weak alias of __strcasecmp */
121 # ifdef USE_AS_STRNCASECMP_L
123 .type __strncasecmp, @gnu_indirect_function /* __strncasecmp is a GNU indirect function; the code below loads candidate implementation addresses into %rax */
124 cmpl $0, __cpu_features+KIND_OFFSET(%rip) /* compare the kind field of __cpu_features with 0 */
126 call __init_cpu_features /* fill in __cpu_features (the branch that skips this call is not visible in this listing) */
128 leaq __strncasecmp_sse42(%rip), %rax /* candidate: the SSE4.2 implementation */
129 testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) /* check the SSE4.2 feature bit */
131 leaq __strncasecmp_ssse3(%rip), %rax /* candidate: the SSSE3 implementation */
132 testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) /* check the SSSE3 feature bit */
134 leaq __strncasecmp_sse2(%rip), %rax /* candidate: the SSE2 implementation */
137 weak_alias (__strncasecmp, strncasecmp) /* export strncasecmp as a weak alias of __strncasecmp */
142 | _SIDD_CMP_EQUAL_EACH
143 | _SIDD_NEGATIVE_POLARITY
144 | _SIDD_LEAST_SIGNIFICANT
145 on pcmpistri to find out if two 16byte data elements are the same
146 and the offset of the first different byte. There are 4 cases:
148 1. Both 16byte data elements are valid and identical.
149 2. Both 16byte data elements have EOS and are identical.
150 3. Both 16byte data elements are valid and they differ at offset X.
151 4. At least one 16byte data element has EOS at offset X. Two 16byte
152 data elements must differ at or before offset X.
154 Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases:
156 case ECX CFlag ZFlag SFlag
162 We exit from the loop for cases 2, 3 and 4 with jbe which branches
163 when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for
166 /* Put all SSE 4.2 functions together. */
167 .section .text.sse4.2,"ax",@progbits
169 .type STRCMP_SSE42, @function
170 # ifdef USE_AS_STRCASECMP_L
171 ENTRY (__strcasecmp_sse42) /* locale-less entry point; control continues into the _l variant after END (see FALLTHROUGH note) */
172 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax /* %rax = TLS offset of __libc_tsd_LOCALE (initial-exec GOT entry) */
175 // XXX 5 byte should be before the function
177 .byte 0x0f,0x1f,0x44,0x00,0x00 /* 5-byte NOP (nopl 0x0(%rax,%rax,1)) used as padding */
178 END (__strcasecmp_sse42)
179 /* FALLTHROUGH to strcasecmp_l. */
181 # ifdef USE_AS_STRNCASECMP_L
182 ENTRY (__strncasecmp_sse42) /* locale-less entry point; control continues into the _l variant after END (see FALLTHROUGH note) */
183 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax /* %rax = TLS offset of __libc_tsd_LOCALE (initial-exec GOT entry) */
186 // XXX 5 byte should be before the function
188 .byte 0x0f,0x1f,0x44,0x00,0x00 /* 5-byte NOP (nopl 0x0(%rax,%rax,1)) used as padding */
189 END (__strncasecmp_sse42)
190 /* FALLTHROUGH to strncasecmp_l. */
198 * This implementation uses SSE to compare up to 16 bytes at a time.
200 # ifdef USE_AS_STRCASECMP_L
201 /* We have to fall back on the C implementation for locales
202 with encodings not matching ASCII for single bytes. */
203 # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
204 movq LOCALE_T___LOCALES+LC_CTYPE*8(%rdx), %rax /* %rax = __locales[LC_CTYPE] entry of the locale passed in %rdx (third argument) */
208 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) /* test bit 0 of the NONASCII_CASE value (assumed to be a 0/1 flag); the former `testl $0' ANDed with zero, so ZF was always set and the jne below was never taken */
209 jne __strcasecmp_l_nonascii /* flag set: this locale needs the non-ASCII fallback */
211 # ifdef USE_AS_STRNCASECMP_L
212 /* We have to fall back on the C implementation for locales
213 with encodings not matching ASCII for single bytes. */
214 # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
215 movq LOCALE_T___LOCALES+LC_CTYPE*8(%rcx), %rax /* %rax = __locales[LC_CTYPE] entry of the locale passed in %rcx (fourth argument) */
219 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) /* test bit 0 of the NONASCII_CASE value (assumed to be a 0/1 flag); the former `testl $0' ANDed with zero, so ZF was always set and the jne below was never taken */
220 jne __strncasecmp_l_nonascii /* flag set: this locale needs the non-ASCII fallback */
223 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
225 je LABEL(strcmp_exitz_sse4_2)
227 je LABEL(Byte0_sse4_2)
232 /* Use 64bit AND here to avoid long NOP padding. */
233 and $0x3f, %rcx /* rsi alignment in cache line */
234 and $0x3f, %rax /* rdi alignment in cache line */
235 # if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
236 .section .rodata.cst16,"aM",@progbits,16
239 .quad 0x4040404040404040
240 .quad 0x4040404040404040
242 .quad 0x5b5b5b5b5b5b5b5b
243 .quad 0x5b5b5b5b5b5b5b5b
245 .quad 0x2020202020202020
246 .quad 0x2020202020202020
248 movdqa .Lbelowupper_sse4(%rip), %xmm4
249 # define UCLOW_reg %xmm4
250 movdqa .Ltopupper_sse4(%rip), %xmm5
251 # define UCHIGH_reg %xmm5
252 movdqa .Ltouppermask_sse4(%rip), %xmm6
253 # define LCQWORD_reg %xmm6
256 ja LABEL(crosscache_sse4_2)/* rsi: 16-byte load will cross cache line */
258 ja LABEL(crosscache_sse4_2)/* rdi: 16-byte load will cross cache line */
261 # if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
262 # define TOLOWER(reg1, reg2) /* Build per-byte range masks for reg1 and reg2 against UCLOW_reg/UCHIGH_reg and AND them with LCQWORD_reg; clobbers %xmm7-%xmm10. Some continuation lines are not visible in this listing. */ \
263 movdqa reg1, %xmm7; /* %xmm7 = copy of reg1 */ \
264 movdqa UCHIGH_reg, %xmm8; /* %xmm8 = upper bound */ \
265 movdqa reg2, %xmm9; /* %xmm9 = copy of reg2 */ \
266 movdqa UCHIGH_reg, %xmm10; /* %xmm10 = upper bound */ \
267 pcmpgtb UCLOW_reg, %xmm7; /* %xmm7 = 0xff where reg1 byte > UCLOW_reg byte (signed compare) */ \
268 pcmpgtb reg1, %xmm8; /* %xmm8 = 0xff where UCHIGH_reg byte > reg1 byte */ \
269 pcmpgtb UCLOW_reg, %xmm9; /* %xmm9 = 0xff where reg2 byte > UCLOW_reg byte */ \
270 pcmpgtb reg2, %xmm10; /* %xmm10 = 0xff where UCHIGH_reg byte > reg2 byte */ \
272 pand %xmm10, %xmm9; /* %xmm9 = reg2 bytes strictly between the two bounds */ \
273 pand LCQWORD_reg, %xmm7; /* keep only the LCQWORD_reg bits in the selected reg1 lanes */ \
274 pand LCQWORD_reg, %xmm9; /* keep only the LCQWORD_reg bits in the selected reg2 lanes */ \
277 TOLOWER (%xmm1, %xmm2)
279 # define TOLOWER(reg1, reg2)
281 pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
282 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
283 pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
284 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
286 sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
287 jnz LABEL(less16bytes_sse4_2)/* If not, find different value or null char */
288 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
290 jbe LABEL(strcmp_exitz_sse4_2)/* finish comparision */
292 add $16, %rsi /* prepare to search next 16 bytes */
293 add $16, %rdi /* prepare to search next 16 bytes */
296 * Determine source and destination string offsets from 16-byte alignment.
297 * Use relative offset difference between the two to determine which case
301 LABEL(crosscache_sse4_2):
302 and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */
303 and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */
304 mov $0xffff, %edx /* for equivalent offset */
306 and $0xf, %ecx /* offset of rsi */
307 and $0xf, %eax /* offset of rdi */
309 je LABEL(ashr_0_sse4_2) /* rsi and rdi relative offset same */
310 ja LABEL(bigger_sse4_2)
311 mov %edx, %r8d /* r8d is offset flag for exit tail */
314 LABEL(bigger_sse4_2):
317 lea LABEL(unaligned_table_sse4_2)(%rip), %r10
318 movslq (%r10, %r9,4), %r9
319 lea (%r10, %r9), %r10
320 jmp *%r10 /* jump to corresponding case */
323 * The following cases will be handled by ashr_0
324 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
325 * n(0~15) n(0~15) 15(15+ n-n) ashr_0
328 LABEL(ashr_0_sse4_2):
331 pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
332 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
333 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
334 pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
337 TOLOWER (%xmm1, %xmm2)
338 pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */
340 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
342 shr %cl, %edx /* adjust 0xffff for offset */
343 shr %cl, %r9d /* adjust for 16-byte offset */
346 * edx must be the same as r9d if the remaining (16-rcx) bytes are equal to
347 * those starting from (16-rax) and no null char was seen.
349 jne LABEL(less32bytes_sse4_2) /* mismatch or null char */
350 UPDATE_STRNCMP_COUNTER
353 pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */
356 * Now both strings are aligned at 16-byte boundary. Loop over strings
357 * checking 32-bytes per iteration.
359 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
361 LABEL(ashr_0_use_sse4_2):
362 movdqa (%rdi,%rdx), %xmm0
363 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
364 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
366 movdqa (%rsi,%rdx), %xmm1
367 TOLOWER (%xmm0, %xmm1)
368 pcmpistri $0x1a, %xmm1, %xmm0
371 jbe LABEL(ashr_0_use_sse4_2_exit)
372 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
374 jbe LABEL(strcmp_exitz_sse4_2)
377 movdqa (%rdi,%rdx), %xmm0
378 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
379 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
381 movdqa (%rsi,%rdx), %xmm1
382 TOLOWER (%xmm0, %xmm1)
383 pcmpistri $0x1a, %xmm1, %xmm0
386 jbe LABEL(ashr_0_use_sse4_2_exit)
387 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
389 jbe LABEL(strcmp_exitz_sse4_2)
391 jmp LABEL(ashr_0_use_sse4_2)
395 LABEL(ashr_0_use_sse4_2_exit):
396 jnc LABEL(strcmp_exitz_sse4_2)
397 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
399 jbe LABEL(strcmp_exitz_sse4_2)
401 lea -16(%rdx, %rcx), %rcx
402 movzbl (%rdi, %rcx), %eax
403 movzbl (%rsi, %rcx), %edx
404 # if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
405 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
406 movl (%rcx,%rax,4), %eax
407 movl (%rcx,%rdx,4), %edx
415 * The following cases will be handled by ashr_1
416 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
417 * n(15) n -15 0(15 +(n-15) - n) ashr_1
420 LABEL(ashr_1_sse4_2):
424 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
425 pslldq $15, %xmm2 /* shift first string to align with second */
426 TOLOWER (%xmm1, %xmm2)
427 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
428 psubb %xmm0, %xmm2 /* packed sub of comparison results*/
430 shr %cl, %edx /* adjust 0xffff for offset */
431 shr %cl, %r9d /* adjust for 16-byte offset */
433 jnz LABEL(less32bytes_sse4_2)/* mismatch or null char seen */
435 UPDATE_STRNCMP_COUNTER
438 mov $16, %rcx /* index for loads*/
439 mov $1, %r9d /* byte position left over from less32bytes case */
441 * Set up %r10 so that we can detect crossing a page boundary.
442 * When %r10 goes positive we have crossed a page boundary and
443 * need to do a nibble.
446 and $0xfff, %r10 /* offset into 4K page */
447 sub $0x1000, %r10 /* subtract 4K pagesize */
448 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
451 LABEL(loop_ashr_1_use_sse4_2):
453 jg LABEL(nibble_ashr_1_use_sse4_2)
455 movdqa (%rdi, %rdx), %xmm0
456 palignr $1, -16(%rdi, %rdx), %xmm0
457 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
458 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
460 movdqa (%rsi,%rdx), %xmm1
461 TOLOWER (%xmm0, %xmm1)
462 pcmpistri $0x1a, %xmm1, %xmm0
464 jbe LABEL(use_sse4_2_exit)
465 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
467 jbe LABEL(strcmp_exitz_sse4_2)
472 jg LABEL(nibble_ashr_1_use_sse4_2)
474 movdqa (%rdi, %rdx), %xmm0
475 palignr $1, -16(%rdi, %rdx), %xmm0
476 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
477 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
479 movdqa (%rsi,%rdx), %xmm1
480 TOLOWER (%xmm0, %xmm1)
481 pcmpistri $0x1a, %xmm1, %xmm0
483 jbe LABEL(use_sse4_2_exit)
484 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
486 jbe LABEL(strcmp_exitz_sse4_2)
489 jmp LABEL(loop_ashr_1_use_sse4_2)
492 LABEL(nibble_ashr_1_use_sse4_2):
494 movdqa -16(%rdi, %rdx), %xmm0
496 pcmpistri $0x3a,%xmm0, %xmm0
497 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
499 jae LABEL(nibble_ashr_use_sse4_2_exit)
502 ja LABEL(loop_ashr_1_use_sse4_2)
504 jmp LABEL(nibble_ashr_use_sse4_2_exit)
507 * The following cases will be handled by ashr_2
508 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
509 * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
512 LABEL(ashr_2_sse4_2):
518 TOLOWER (%xmm1, %xmm2)
525 jnz LABEL(less32bytes_sse4_2)
527 UPDATE_STRNCMP_COUNTER
530 mov $16, %rcx /* index for loads */
531 mov $2, %r9d /* byte position left over from less32bytes case */
533 * Set up %r10 so that we can detect crossing a page boundary.
534 * When %r10 goes positive we have crossed a page boundary and
535 * need to do a nibble.
538 and $0xfff, %r10 /* offset into 4K page */
539 sub $0x1000, %r10 /* subtract 4K pagesize */
540 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
543 LABEL(loop_ashr_2_use_sse4_2):
545 jg LABEL(nibble_ashr_2_use_sse4_2)
547 movdqa (%rdi, %rdx), %xmm0
548 palignr $2, -16(%rdi, %rdx), %xmm0
549 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
550 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
552 movdqa (%rsi,%rdx), %xmm1
553 TOLOWER (%xmm0, %xmm1)
554 pcmpistri $0x1a, %xmm1, %xmm0
556 jbe LABEL(use_sse4_2_exit)
557 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
559 jbe LABEL(strcmp_exitz_sse4_2)
564 jg LABEL(nibble_ashr_2_use_sse4_2)
566 movdqa (%rdi, %rdx), %xmm0
567 palignr $2, -16(%rdi, %rdx), %xmm0
568 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
569 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
571 movdqa (%rsi,%rdx), %xmm1
572 TOLOWER (%xmm0, %xmm1)
573 pcmpistri $0x1a, %xmm1, %xmm0
575 jbe LABEL(use_sse4_2_exit)
576 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
578 jbe LABEL(strcmp_exitz_sse4_2)
581 jmp LABEL(loop_ashr_2_use_sse4_2)
584 LABEL(nibble_ashr_2_use_sse4_2):
586 movdqa -16(%rdi, %rdx), %xmm0
588 pcmpistri $0x3a,%xmm0, %xmm0
589 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
591 jae LABEL(nibble_ashr_use_sse4_2_exit)
594 ja LABEL(loop_ashr_2_use_sse4_2)
596 jmp LABEL(nibble_ashr_use_sse4_2_exit)
599 * The following cases will be handled by ashr_3
600 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
601 * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
604 LABEL(ashr_3_sse4_2):
610 TOLOWER (%xmm1, %xmm2)
617 jnz LABEL(less32bytes_sse4_2)
620 UPDATE_STRNCMP_COUNTER
623 mov $16, %rcx /* index for loads */
624 mov $3, %r9d /* byte position left over from less32bytes case */
626 * Set up %r10 so that we can detect crossing a page boundary.
627 * When %r10 goes positive we have crossed a page boundary and
628 * need to do a nibble.
631 and $0xfff, %r10 /* offset into 4K page */
632 sub $0x1000, %r10 /* subtract 4K pagesize */
633 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
635 LABEL(loop_ashr_3_use_sse4_2):
637 jg LABEL(nibble_ashr_3_use_sse4_2)
639 movdqa (%rdi, %rdx), %xmm0
640 palignr $3, -16(%rdi, %rdx), %xmm0
641 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
642 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
644 movdqa (%rsi,%rdx), %xmm1
645 TOLOWER (%xmm0, %xmm1)
646 pcmpistri $0x1a, %xmm1, %xmm0
648 jbe LABEL(use_sse4_2_exit)
649 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
651 jbe LABEL(strcmp_exitz_sse4_2)
656 jg LABEL(nibble_ashr_3_use_sse4_2)
658 movdqa (%rdi, %rdx), %xmm0
659 palignr $3, -16(%rdi, %rdx), %xmm0
660 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
661 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
663 movdqa (%rsi,%rdx), %xmm1
664 TOLOWER (%xmm0, %xmm1)
665 pcmpistri $0x1a, %xmm1, %xmm0
667 jbe LABEL(use_sse4_2_exit)
668 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
670 jbe LABEL(strcmp_exitz_sse4_2)
673 jmp LABEL(loop_ashr_3_use_sse4_2)
676 LABEL(nibble_ashr_3_use_sse4_2):
678 movdqa -16(%rdi, %rdx), %xmm0
680 pcmpistri $0x3a,%xmm0, %xmm0
681 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
683 jae LABEL(nibble_ashr_use_sse4_2_exit)
686 ja LABEL(loop_ashr_3_use_sse4_2)
688 jmp LABEL(nibble_ashr_use_sse4_2_exit)
691 * The following cases will be handled by ashr_4
692 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
693 * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
696 LABEL(ashr_4_sse4_2):
702 TOLOWER (%xmm1, %xmm2)
709 jnz LABEL(less32bytes_sse4_2)
712 UPDATE_STRNCMP_COUNTER
715 mov $16, %rcx /* index for loads */
716 mov $4, %r9d /* byte position left over from less32bytes case */
718 * Set up %r10 so that we can detect crossing a page boundary.
719 * When %r10 goes positive we have crossed a page boundary and
720 * need to do a nibble.
723 and $0xfff, %r10 /* offset into 4K page */
724 sub $0x1000, %r10 /* subtract 4K pagesize */
725 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
728 LABEL(loop_ashr_4_use_sse4_2):
730 jg LABEL(nibble_ashr_4_use_sse4_2)
732 movdqa (%rdi, %rdx), %xmm0
733 palignr $4, -16(%rdi, %rdx), %xmm0
734 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
735 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
737 movdqa (%rsi,%rdx), %xmm1
738 TOLOWER (%xmm0, %xmm1)
739 pcmpistri $0x1a, %xmm1, %xmm0
741 jbe LABEL(use_sse4_2_exit)
742 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
744 jbe LABEL(strcmp_exitz_sse4_2)
749 jg LABEL(nibble_ashr_4_use_sse4_2)
751 movdqa (%rdi, %rdx), %xmm0
752 palignr $4, -16(%rdi, %rdx), %xmm0
753 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
754 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
756 movdqa (%rsi,%rdx), %xmm1
757 TOLOWER (%xmm0, %xmm1)
758 pcmpistri $0x1a, %xmm1, %xmm0
760 jbe LABEL(use_sse4_2_exit)
761 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
763 jbe LABEL(strcmp_exitz_sse4_2)
766 jmp LABEL(loop_ashr_4_use_sse4_2)
769 LABEL(nibble_ashr_4_use_sse4_2):
771 movdqa -16(%rdi, %rdx), %xmm0
773 pcmpistri $0x3a,%xmm0, %xmm0
774 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
776 jae LABEL(nibble_ashr_use_sse4_2_exit)
779 ja LABEL(loop_ashr_4_use_sse4_2)
781 jmp LABEL(nibble_ashr_use_sse4_2_exit)
784 * The following cases will be handled by ashr_5
785 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
786 * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
789 LABEL(ashr_5_sse4_2):
795 TOLOWER (%xmm1, %xmm2)
802 jnz LABEL(less32bytes_sse4_2)
805 UPDATE_STRNCMP_COUNTER
808 mov $16, %rcx /* index for loads */
809 mov $5, %r9d /* byte position left over from less32bytes case */
811 * Set up %r10 so that we can detect crossing a page boundary.
812 * When %r10 goes positive we have crossed a page boundary and
813 * need to do a nibble.
816 and $0xfff, %r10 /* offset into 4K page */
817 sub $0x1000, %r10 /* subtract 4K pagesize */
818 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
821 LABEL(loop_ashr_5_use_sse4_2):
823 jg LABEL(nibble_ashr_5_use_sse4_2)
825 movdqa (%rdi, %rdx), %xmm0
826 palignr $5, -16(%rdi, %rdx), %xmm0
827 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
828 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
830 movdqa (%rsi,%rdx), %xmm1
831 TOLOWER (%xmm0, %xmm1)
832 pcmpistri $0x1a, %xmm1, %xmm0
834 jbe LABEL(use_sse4_2_exit)
835 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
837 jbe LABEL(strcmp_exitz_sse4_2)
842 jg LABEL(nibble_ashr_5_use_sse4_2)
844 movdqa (%rdi, %rdx), %xmm0
846 palignr $5, -16(%rdi, %rdx), %xmm0
847 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
848 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
850 movdqa (%rsi,%rdx), %xmm1
851 TOLOWER (%xmm0, %xmm1)
852 pcmpistri $0x1a, %xmm1, %xmm0
854 jbe LABEL(use_sse4_2_exit)
855 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
857 jbe LABEL(strcmp_exitz_sse4_2)
860 jmp LABEL(loop_ashr_5_use_sse4_2)
863 LABEL(nibble_ashr_5_use_sse4_2):
865 movdqa -16(%rdi, %rdx), %xmm0
867 pcmpistri $0x3a,%xmm0, %xmm0
868 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
870 jae LABEL(nibble_ashr_use_sse4_2_exit)
873 ja LABEL(loop_ashr_5_use_sse4_2)
875 jmp LABEL(nibble_ashr_use_sse4_2_exit)
878 * The following cases will be handled by ashr_6
879 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
880 * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
883 LABEL(ashr_6_sse4_2):
889 TOLOWER (%xmm1, %xmm2)
896 jnz LABEL(less32bytes_sse4_2)
899 UPDATE_STRNCMP_COUNTER
902 mov $16, %rcx /* index for loads */
903 mov $6, %r9d /* byte position left over from less32bytes case */
905 * Set up %r10 so that we can detect crossing a page boundary.
906 * When %r10 goes positive we have crossed a page boundary and
907 * need to do a nibble.
910 and $0xfff, %r10 /* offset into 4K page */
911 sub $0x1000, %r10 /* subtract 4K pagesize */
912 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
915 LABEL(loop_ashr_6_use_sse4_2):
917 jg LABEL(nibble_ashr_6_use_sse4_2)
919 movdqa (%rdi, %rdx), %xmm0
920 palignr $6, -16(%rdi, %rdx), %xmm0
921 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
922 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
924 movdqa (%rsi,%rdx), %xmm1
925 TOLOWER (%xmm0, %xmm1)
926 pcmpistri $0x1a, %xmm1, %xmm0
928 jbe LABEL(use_sse4_2_exit)
929 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
931 jbe LABEL(strcmp_exitz_sse4_2)
936 jg LABEL(nibble_ashr_6_use_sse4_2)
938 movdqa (%rdi, %rdx), %xmm0
939 palignr $6, -16(%rdi, %rdx), %xmm0
940 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
941 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
943 movdqa (%rsi,%rdx), %xmm1
944 TOLOWER (%xmm0, %xmm1)
945 pcmpistri $0x1a, %xmm1, %xmm0
947 jbe LABEL(use_sse4_2_exit)
948 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
950 jbe LABEL(strcmp_exitz_sse4_2)
953 jmp LABEL(loop_ashr_6_use_sse4_2)
956 LABEL(nibble_ashr_6_use_sse4_2):
958 movdqa -16(%rdi, %rdx), %xmm0
960 pcmpistri $0x3a,%xmm0, %xmm0
961 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
963 jae LABEL(nibble_ashr_use_sse4_2_exit)
966 ja LABEL(loop_ashr_6_use_sse4_2)
968 jmp LABEL(nibble_ashr_use_sse4_2_exit)
971 * The following cases will be handled by ashr_7
972 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
973 * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
976 LABEL(ashr_7_sse4_2):
982 TOLOWER (%xmm1, %xmm2)
989 jnz LABEL(less32bytes_sse4_2)
992 UPDATE_STRNCMP_COUNTER
995 mov $16, %rcx /* index for loads */
996 mov $7, %r9d /* byte position left over from less32bytes case */
998 * Set up %r10 so that we can detect crossing a page boundary.
999 * When %r10 goes positive we have crossed a page boundary and
1000 * need to do a nibble.
1003 and $0xfff, %r10 /* offset into 4K page */
1004 sub $0x1000, %r10 /* subtract 4K pagesize */
1005 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1008 LABEL(loop_ashr_7_use_sse4_2):
1010 jg LABEL(nibble_ashr_7_use_sse4_2)
1012 movdqa (%rdi, %rdx), %xmm0
1013 palignr $7, -16(%rdi, %rdx), %xmm0
1014 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1015 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1017 movdqa (%rsi,%rdx), %xmm1
1018 TOLOWER (%xmm0, %xmm1)
1019 pcmpistri $0x1a, %xmm1, %xmm0
1021 jbe LABEL(use_sse4_2_exit)
1022 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1024 jbe LABEL(strcmp_exitz_sse4_2)
1029 jg LABEL(nibble_ashr_7_use_sse4_2)
1031 movdqa (%rdi, %rdx), %xmm0
1032 palignr $7, -16(%rdi, %rdx), %xmm0
1033 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1034 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1036 movdqa (%rsi,%rdx), %xmm1
1037 TOLOWER (%xmm0, %xmm1)
1038 pcmpistri $0x1a, %xmm1, %xmm0
1040 jbe LABEL(use_sse4_2_exit)
1041 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1043 jbe LABEL(strcmp_exitz_sse4_2)
1046 jmp LABEL(loop_ashr_7_use_sse4_2)
1049 LABEL(nibble_ashr_7_use_sse4_2):
1051 movdqa -16(%rdi, %rdx), %xmm0
1053 pcmpistri $0x3a,%xmm0, %xmm0
1054 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1056 jae LABEL(nibble_ashr_use_sse4_2_exit)
1059 ja LABEL(loop_ashr_7_use_sse4_2)
1061 jmp LABEL(nibble_ashr_use_sse4_2_exit)
1064 * The following cases will be handled by ashr_8
1065 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1066 * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
1069 LABEL(ashr_8_sse4_2):
1071 movdqa (%rdi), %xmm2
1072 movdqa (%rsi), %xmm1
1073 pcmpeqb %xmm1, %xmm0
1075 TOLOWER (%xmm1, %xmm2)
1076 pcmpeqb %xmm1, %xmm2
1078 pmovmskb %xmm2, %r9d
1082 jnz LABEL(less32bytes_sse4_2)
1083 movdqa (%rdi), %xmm3
1085 UPDATE_STRNCMP_COUNTER
1088 mov $16, %rcx /* index for loads */
1089 mov $8, %r9d /* byte position left over from less32bytes case */
1091 * Set up %r10 so that we can detect crossing a page boundary.
1092 * When %r10 goes positive we have crossed a page boundary and
1093 * need to do a nibble.
1096 and $0xfff, %r10 /* offset into 4K page */
1097 sub $0x1000, %r10 /* subtract 4K pagesize */
1098 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1101 LABEL(loop_ashr_8_use_sse4_2):
1103 jg LABEL(nibble_ashr_8_use_sse4_2)
1105 movdqa (%rdi, %rdx), %xmm0
1106 palignr $8, -16(%rdi, %rdx), %xmm0
1107 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1108 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1110 movdqa (%rsi,%rdx), %xmm1
1111 TOLOWER (%xmm0, %xmm1)
1112 pcmpistri $0x1a, %xmm1, %xmm0
1114 jbe LABEL(use_sse4_2_exit)
1115 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1117 jbe LABEL(strcmp_exitz_sse4_2)
1122 jg LABEL(nibble_ashr_8_use_sse4_2)
1124 movdqa (%rdi, %rdx), %xmm0
1125 palignr $8, -16(%rdi, %rdx), %xmm0
1126 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1127 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1129 movdqa (%rsi,%rdx), %xmm1
1130 TOLOWER (%xmm0, %xmm1)
1131 pcmpistri $0x1a, %xmm1, %xmm0
1133 jbe LABEL(use_sse4_2_exit)
1134 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1136 jbe LABEL(strcmp_exitz_sse4_2)
1139 jmp LABEL(loop_ashr_8_use_sse4_2)
1142 LABEL(nibble_ashr_8_use_sse4_2):
1144 movdqa -16(%rdi, %rdx), %xmm0
1146 pcmpistri $0x3a,%xmm0, %xmm0
1147 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1149 jae LABEL(nibble_ashr_use_sse4_2_exit)
1152 ja LABEL(loop_ashr_8_use_sse4_2)
1154 jmp LABEL(nibble_ashr_use_sse4_2_exit)
1157 * The following cases will be handled by ashr_9
1158 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1159 * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
1162 LABEL(ashr_9_sse4_2):
1164 movdqa (%rdi), %xmm2
1165 movdqa (%rsi), %xmm1
1166 pcmpeqb %xmm1, %xmm0
1168 TOLOWER (%xmm1, %xmm2)
1169 pcmpeqb %xmm1, %xmm2
1171 pmovmskb %xmm2, %r9d
1175 jnz LABEL(less32bytes_sse4_2)
1176 movdqa (%rdi), %xmm3
1178 UPDATE_STRNCMP_COUNTER
1181 mov $16, %rcx /* index for loads */
1182 mov $9, %r9d /* byte position left over from less32bytes case */
1184 * Set up %r10 so that we can detect crossing a page boundary.
1185 * When %r10 goes positive we have crossed a page boundary and
1186 * need to do a nibble.
1189 and $0xfff, %r10 /* offset into 4K page */
1190 sub $0x1000, %r10 /* subtract 4K pagesize */
1191 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1194 LABEL(loop_ashr_9_use_sse4_2):
1196 jg LABEL(nibble_ashr_9_use_sse4_2)
1198 movdqa (%rdi, %rdx), %xmm0
1200 palignr $9, -16(%rdi, %rdx), %xmm0
1201 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1202 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1204 movdqa (%rsi,%rdx), %xmm1
1205 TOLOWER (%xmm0, %xmm1)
1206 pcmpistri $0x1a, %xmm1, %xmm0
1208 jbe LABEL(use_sse4_2_exit)
1209 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1211 jbe LABEL(strcmp_exitz_sse4_2)
1216 jg LABEL(nibble_ashr_9_use_sse4_2)
1218 movdqa (%rdi, %rdx), %xmm0
1219 palignr $9, -16(%rdi, %rdx), %xmm0
1220 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1221 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1223 movdqa (%rsi,%rdx), %xmm1
1224 TOLOWER (%xmm0, %xmm1)
1225 pcmpistri $0x1a, %xmm1, %xmm0
1227 jbe LABEL(use_sse4_2_exit)
1228 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1230 jbe LABEL(strcmp_exitz_sse4_2)
1233 jmp LABEL(loop_ashr_9_use_sse4_2)
1236 LABEL(nibble_ashr_9_use_sse4_2):
1238 movdqa -16(%rdi, %rdx), %xmm0
1240 pcmpistri $0x3a,%xmm0, %xmm0
1241 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1243 jae LABEL(nibble_ashr_use_sse4_2_exit)
1246 ja LABEL(loop_ashr_9_use_sse4_2)
1248 jmp LABEL(nibble_ashr_use_sse4_2_exit)
1251 * The following cases will be handled by ashr_10
1252 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1253 * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
1256 LABEL(ashr_10_sse4_2):
1258 movdqa (%rdi), %xmm2
1259 movdqa (%rsi), %xmm1
1260 pcmpeqb %xmm1, %xmm0
1262 TOLOWER (%xmm1, %xmm2)
1263 pcmpeqb %xmm1, %xmm2
1265 pmovmskb %xmm2, %r9d
1269 jnz LABEL(less32bytes_sse4_2)
1270 movdqa (%rdi), %xmm3
1272 UPDATE_STRNCMP_COUNTER
1275 mov $16, %rcx /* index for loads */
1276 mov $10, %r9d /* byte position left over from less32bytes case */
1278 * Set up %r10 so that we can detect crossing a page boundary.
1279 * When %r10 goes positive we have crossed a page boundary and
1280 * need to do a nibble.
1283 and $0xfff, %r10 /* offset into 4K page */
1284 sub $0x1000, %r10 /* subtract 4K pagesize */
1285 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1288 LABEL(loop_ashr_10_use_sse4_2):
1290 jg LABEL(nibble_ashr_10_use_sse4_2)
1292 movdqa (%rdi, %rdx), %xmm0
1293 palignr $10, -16(%rdi, %rdx), %xmm0
1294 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1295 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1297 movdqa (%rsi,%rdx), %xmm1
1298 TOLOWER (%xmm0, %xmm1)
1299 pcmpistri $0x1a, %xmm1, %xmm0
1301 jbe LABEL(use_sse4_2_exit)
1302 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1304 jbe LABEL(strcmp_exitz_sse4_2)
1309 jg LABEL(nibble_ashr_10_use_sse4_2)
1311 movdqa (%rdi, %rdx), %xmm0
1312 palignr $10, -16(%rdi, %rdx), %xmm0
1313 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1314 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1316 movdqa (%rsi,%rdx), %xmm1
1317 TOLOWER (%xmm0, %xmm1)
1318 pcmpistri $0x1a, %xmm1, %xmm0
1320 jbe LABEL(use_sse4_2_exit)
1321 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1323 jbe LABEL(strcmp_exitz_sse4_2)
1326 jmp LABEL(loop_ashr_10_use_sse4_2)
1329 LABEL(nibble_ashr_10_use_sse4_2):
1331 movdqa -16(%rdi, %rdx), %xmm0
1333 pcmpistri $0x3a,%xmm0, %xmm0
1334 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1336 jae LABEL(nibble_ashr_use_sse4_2_exit)
1339 ja LABEL(loop_ashr_10_use_sse4_2)
1341 jmp LABEL(nibble_ashr_use_sse4_2_exit)
1344 * The following cases will be handled by ashr_11
1345 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1346 * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
1349 LABEL(ashr_11_sse4_2):
/* Entry point for the ashr_11 case; its offset is one of the entries of
   unaligned_table_sse4_2.  NOTE(review): this listing does not show every
   instruction of the routine (for instance the one that sets the flags
   tested by the jnz below), so these notes cover only what is visible.
   First, two aligned 16-byte loads from (%rdi) and (%rsi).  */
1351 movdqa (%rdi), %xmm2
1352 movdqa (%rsi), %xmm1
/* Byte-wise equality of %xmm1 against %xmm0, result left in %xmm0.
   Presumably %xmm0 is zero at this point, which would make this a NUL
   detector -- TODO confirm.  */
1353 pcmpeqb %xmm1, %xmm0
1355 TOLOWER (%xmm1, %xmm2)
/* %xmm2 = per-byte equality mask of %xmm1 and %xmm2; pmovmskb then
   copies the top bit of every %xmm2 byte into %r9d.  */
1356 pcmpeqb %xmm1, %xmm2
1358 pmovmskb %xmm2, %r9d
1362 jnz LABEL(less32bytes_sse4_2)
1363 movdqa (%rdi), %xmm3
/* Defined at the top of the file: empty in some build variants, and in
   the length-limited ones a counter update that can branch to
   strcmp_exitz_sse4_2.  */
1365 UPDATE_STRNCMP_COUNTER
1368 mov $16, %rcx /* index for loads */
1369 mov $11, %r9d /* byte position left over from less32bytes case */
1371 * Setup %r10 value allows us to detect crossing a page boundary.
1372 * When %r10 goes positive we have crossed a page boundary and
1373 * need to do a nibble.
1376 and $0xfff, %r10 /* offset into 4K page */
1377 sub $0x1000, %r10 /* subtract 4K pagesize */
1378 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1381 LABEL(loop_ashr_11_use_sse4_2):
/* Main loop; the body appears twice (unrolled).  The jg leaves for the
   nibble path; by the comment above it presumably fires once %r10 turns
   positive, i.e. a page boundary is reached -- the instruction updating
   %r10 is not visible here.  */
1383 jg LABEL(nibble_ashr_11_use_sse4_2)
/* palignr $11 leaves in %xmm0 the last 5 bytes of the previous aligned
   %rdi block followed by the first 11 bytes of the current one.
   pcmpistri $0x1a compares that with the 16 bytes at (%rsi,%rdx), which
   the case-insensitive builds load and pass through TOLOWER first.  The
   jbe after it is taken when pcmpistri set CF (a differing byte) or ZF
   (a NUL in the %rsi-side operand).  */
1385 movdqa (%rdi, %rdx), %xmm0
1386 palignr $11, -16(%rdi, %rdx), %xmm0
1387 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1388 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1390 movdqa (%rsi,%rdx), %xmm1
1391 TOLOWER (%xmm0, %xmm1)
1392 pcmpistri $0x1a, %xmm1, %xmm0
1394 jbe LABEL(use_sse4_2_exit)
/* Length-limited builds only; the counter update feeding this jbe is not
   visible in this listing.  */
1395 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1397 jbe LABEL(strcmp_exitz_sse4_2)
/* Second copy of the loop body; same steps as the first.  */
1402 jg LABEL(nibble_ashr_11_use_sse4_2)
1404 movdqa (%rdi, %rdx), %xmm0
1405 palignr $11, -16(%rdi, %rdx), %xmm0
1406 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1407 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1409 movdqa (%rsi,%rdx), %xmm1
1410 TOLOWER (%xmm0, %xmm1)
1411 pcmpistri $0x1a, %xmm1, %xmm0
1413 jbe LABEL(use_sse4_2_exit)
1414 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1416 jbe LABEL(strcmp_exitz_sse4_2)
1419 jmp LABEL(loop_ashr_11_use_sse4_2)
1422 LABEL(nibble_ashr_11_use_sse4_2):
/* Nibble path, entered from the jg instructions in the loop.  Reloads the
   previous aligned %rdi block and runs pcmpistri $0x3a on it against
   itself.  The compares that set the flags for the jae/ja below are not
   visible here; presumably they test the index pcmpistri leaves in %ecx
   to choose between resuming the loop and the common exit -- TODO
   confirm.  */
1424 movdqa -16(%rdi, %rdx), %xmm0
1426 pcmpistri $0x3a,%xmm0, %xmm0
1427 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1429 jae LABEL(nibble_ashr_use_sse4_2_exit)
1432 ja LABEL(loop_ashr_11_use_sse4_2)
1434 jmp LABEL(nibble_ashr_use_sse4_2_exit)
1437 * The following cases will be handled by ashr_12
1438 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1439 * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
1442 LABEL(ashr_12_sse4_2):
/* Entry point for the ashr_12 case; its offset is one of the entries of
   unaligned_table_sse4_2.  NOTE(review): this listing does not show every
   instruction of the routine (for instance the one that sets the flags
   tested by the jnz below), so these notes cover only what is visible.
   First, two aligned 16-byte loads from (%rdi) and (%rsi).  */
1444 movdqa (%rdi), %xmm2
1445 movdqa (%rsi), %xmm1
/* Byte-wise equality of %xmm1 against %xmm0, result left in %xmm0.
   Presumably %xmm0 is zero at this point, which would make this a NUL
   detector -- TODO confirm.  */
1446 pcmpeqb %xmm1, %xmm0
1448 TOLOWER (%xmm1, %xmm2)
/* %xmm2 = per-byte equality mask of %xmm1 and %xmm2; pmovmskb then
   copies the top bit of every %xmm2 byte into %r9d.  */
1449 pcmpeqb %xmm1, %xmm2
1451 pmovmskb %xmm2, %r9d
1455 jnz LABEL(less32bytes_sse4_2)
1456 movdqa (%rdi), %xmm3
/* Defined at the top of the file: empty in some build variants, and in
   the length-limited ones a counter update that can branch to
   strcmp_exitz_sse4_2.  */
1458 UPDATE_STRNCMP_COUNTER
1461 mov $16, %rcx /* index for loads */
1462 mov $12, %r9d /* byte position left over from less32bytes case */
1464 * Setup %r10 value allows us to detect crossing a page boundary.
1465 * When %r10 goes positive we have crossed a page boundary and
1466 * need to do a nibble.
1469 and $0xfff, %r10 /* offset into 4K page */
1470 sub $0x1000, %r10 /* subtract 4K pagesize */
1471 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1474 LABEL(loop_ashr_12_use_sse4_2):
/* Main loop; the body appears twice (unrolled).  The jg leaves for the
   nibble path; by the comment above it presumably fires once %r10 turns
   positive, i.e. a page boundary is reached -- the instruction updating
   %r10 is not visible here.  */
1476 jg LABEL(nibble_ashr_12_use_sse4_2)
/* palignr $12 leaves in %xmm0 the last 4 bytes of the previous aligned
   %rdi block followed by the first 12 bytes of the current one.
   pcmpistri $0x1a compares that with the 16 bytes at (%rsi,%rdx), which
   the case-insensitive builds load and pass through TOLOWER first.  The
   jbe after it is taken when pcmpistri set CF (a differing byte) or ZF
   (a NUL in the %rsi-side operand).  */
1478 movdqa (%rdi, %rdx), %xmm0
1479 palignr $12, -16(%rdi, %rdx), %xmm0
1480 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1481 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1483 movdqa (%rsi,%rdx), %xmm1
1484 TOLOWER (%xmm0, %xmm1)
1485 pcmpistri $0x1a, %xmm1, %xmm0
1487 jbe LABEL(use_sse4_2_exit)
/* Length-limited builds only; the counter update feeding this jbe is not
   visible in this listing.  */
1488 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1490 jbe LABEL(strcmp_exitz_sse4_2)
/* Second copy of the loop body; same steps as the first.  */
1495 jg LABEL(nibble_ashr_12_use_sse4_2)
1497 movdqa (%rdi, %rdx), %xmm0
1498 palignr $12, -16(%rdi, %rdx), %xmm0
1499 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1500 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1502 movdqa (%rsi,%rdx), %xmm1
1503 TOLOWER (%xmm0, %xmm1)
1504 pcmpistri $0x1a, %xmm1, %xmm0
1506 jbe LABEL(use_sse4_2_exit)
1507 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1509 jbe LABEL(strcmp_exitz_sse4_2)
1512 jmp LABEL(loop_ashr_12_use_sse4_2)
1515 LABEL(nibble_ashr_12_use_sse4_2):
/* Nibble path, entered from the jg instructions in the loop.  Reloads the
   previous aligned %rdi block and runs pcmpistri $0x3a on it against
   itself.  The compares that set the flags for the jae/ja below are not
   visible here; presumably they test the index pcmpistri leaves in %ecx
   to choose between resuming the loop and the common exit -- TODO
   confirm.  */
1517 movdqa -16(%rdi, %rdx), %xmm0
1519 pcmpistri $0x3a,%xmm0, %xmm0
1520 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1522 jae LABEL(nibble_ashr_use_sse4_2_exit)
1525 ja LABEL(loop_ashr_12_use_sse4_2)
1527 jmp LABEL(nibble_ashr_use_sse4_2_exit)
1530 * The following cases will be handled by ashr_13
1531 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1532 * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
1535 LABEL(ashr_13_sse4_2):
/* Entry point for the ashr_13 case; its offset is one of the entries of
   unaligned_table_sse4_2.  NOTE(review): this listing does not show every
   instruction of the routine (for instance the one that sets the flags
   tested by the jnz below), so these notes cover only what is visible.
   First, two aligned 16-byte loads from (%rdi) and (%rsi).  */
1537 movdqa (%rdi), %xmm2
1538 movdqa (%rsi), %xmm1
/* Byte-wise equality of %xmm1 against %xmm0, result left in %xmm0.
   Presumably %xmm0 is zero at this point, which would make this a NUL
   detector -- TODO confirm.  */
1539 pcmpeqb %xmm1, %xmm0
1541 TOLOWER (%xmm1, %xmm2)
/* %xmm2 = per-byte equality mask of %xmm1 and %xmm2; pmovmskb then
   copies the top bit of every %xmm2 byte into %r9d.  */
1542 pcmpeqb %xmm1, %xmm2
1544 pmovmskb %xmm2, %r9d
1548 jnz LABEL(less32bytes_sse4_2)
1549 movdqa (%rdi), %xmm3
/* Defined at the top of the file: empty in some build variants, and in
   the length-limited ones a counter update that can branch to
   strcmp_exitz_sse4_2.  */
1551 UPDATE_STRNCMP_COUNTER
1554 mov $16, %rcx /* index for loads */
1555 mov $13, %r9d /* byte position left over from less32bytes case */
1557 * Setup %r10 value allows us to detect crossing a page boundary.
1558 * When %r10 goes positive we have crossed a page boundary and
1559 * need to do a nibble.
1562 and $0xfff, %r10 /* offset into 4K page */
1563 sub $0x1000, %r10 /* subtract 4K pagesize */
1565 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1568 LABEL(loop_ashr_13_use_sse4_2):
/* Main loop; the body appears twice (unrolled).  The jg leaves for the
   nibble path; by the comment above it presumably fires once %r10 turns
   positive, i.e. a page boundary is reached -- the instruction updating
   %r10 is not visible here.  */
1570 jg LABEL(nibble_ashr_13_use_sse4_2)
/* palignr $13 leaves in %xmm0 the last 3 bytes of the previous aligned
   %rdi block followed by the first 13 bytes of the current one.
   pcmpistri $0x1a compares that with the 16 bytes at (%rsi,%rdx), which
   the case-insensitive builds load and pass through TOLOWER first.  The
   jbe after it is taken when pcmpistri set CF (a differing byte) or ZF
   (a NUL in the %rsi-side operand).  */
1572 movdqa (%rdi, %rdx), %xmm0
1573 palignr $13, -16(%rdi, %rdx), %xmm0
1574 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1575 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1577 movdqa (%rsi,%rdx), %xmm1
1578 TOLOWER (%xmm0, %xmm1)
1579 pcmpistri $0x1a, %xmm1, %xmm0
1581 jbe LABEL(use_sse4_2_exit)
/* Length-limited builds only; the counter update feeding this jbe is not
   visible in this listing.  */
1582 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1584 jbe LABEL(strcmp_exitz_sse4_2)
/* Second copy of the loop body; same steps as the first.  */
1589 jg LABEL(nibble_ashr_13_use_sse4_2)
1591 movdqa (%rdi, %rdx), %xmm0
1592 palignr $13, -16(%rdi, %rdx), %xmm0
1593 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1594 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1596 movdqa (%rsi,%rdx), %xmm1
1597 TOLOWER (%xmm0, %xmm1)
1598 pcmpistri $0x1a, %xmm1, %xmm0
1600 jbe LABEL(use_sse4_2_exit)
1601 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1603 jbe LABEL(strcmp_exitz_sse4_2)
1606 jmp LABEL(loop_ashr_13_use_sse4_2)
1609 LABEL(nibble_ashr_13_use_sse4_2):
/* Nibble path, entered from the jg instructions in the loop.  Reloads the
   previous aligned %rdi block and runs pcmpistri $0x3a on it against
   itself.  The compares that set the flags for the jae/ja below are not
   visible here; presumably they test the index pcmpistri leaves in %ecx
   to choose between resuming the loop and the common exit -- TODO
   confirm.  */
1611 movdqa -16(%rdi, %rdx), %xmm0
1613 pcmpistri $0x3a,%xmm0, %xmm0
1614 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1616 jae LABEL(nibble_ashr_use_sse4_2_exit)
1619 ja LABEL(loop_ashr_13_use_sse4_2)
1621 jmp LABEL(nibble_ashr_use_sse4_2_exit)
1624 * The following cases will be handled by ashr_14
1625 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1626 * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
1629 LABEL(ashr_14_sse4_2):
/* Entry point for the ashr_14 case; its offset is one of the entries of
   unaligned_table_sse4_2.  NOTE(review): this listing does not show every
   instruction of the routine (for instance the one that sets the flags
   tested by the jnz below), so these notes cover only what is visible.
   First, two aligned 16-byte loads from (%rdi) and (%rsi).  */
1631 movdqa (%rdi), %xmm2
1632 movdqa (%rsi), %xmm1
/* Byte-wise equality of %xmm1 against %xmm0, result left in %xmm0.
   Presumably %xmm0 is zero at this point, which would make this a NUL
   detector -- TODO confirm.  */
1633 pcmpeqb %xmm1, %xmm0
1635 TOLOWER (%xmm1, %xmm2)
/* %xmm2 = per-byte equality mask of %xmm1 and %xmm2; pmovmskb then
   copies the top bit of every %xmm2 byte into %r9d.  */
1636 pcmpeqb %xmm1, %xmm2
1638 pmovmskb %xmm2, %r9d
1642 jnz LABEL(less32bytes_sse4_2)
1643 movdqa (%rdi), %xmm3
/* Defined at the top of the file: empty in some build variants, and in
   the length-limited ones a counter update that can branch to
   strcmp_exitz_sse4_2.  */
1645 UPDATE_STRNCMP_COUNTER
1648 mov $16, %rcx /* index for loads */
1649 mov $14, %r9d /* byte position left over from less32bytes case */
1651 * Setup %r10 value allows us to detect crossing a page boundary.
1652 * When %r10 goes positive we have crossed a page boundary and
1653 * need to do a nibble.
1656 and $0xfff, %r10 /* offset into 4K page */
1657 sub $0x1000, %r10 /* subtract 4K pagesize */
1659 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1662 LABEL(loop_ashr_14_use_sse4_2):
/* Main loop; the body appears twice (unrolled).  The jg leaves for the
   nibble path; by the comment above it presumably fires once %r10 turns
   positive, i.e. a page boundary is reached -- the instruction updating
   %r10 is not visible here.  */
1664 jg LABEL(nibble_ashr_14_use_sse4_2)
/* palignr $14 leaves in %xmm0 the last 2 bytes of the previous aligned
   %rdi block followed by the first 14 bytes of the current one.
   pcmpistri $0x1a compares that with the 16 bytes at (%rsi,%rdx), which
   the case-insensitive builds load and pass through TOLOWER first.  The
   jbe after it is taken when pcmpistri set CF (a differing byte) or ZF
   (a NUL in the %rsi-side operand).  */
1666 movdqa (%rdi, %rdx), %xmm0
1667 palignr $14, -16(%rdi, %rdx), %xmm0
1668 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1669 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1671 movdqa (%rsi,%rdx), %xmm1
1672 TOLOWER (%xmm0, %xmm1)
1673 pcmpistri $0x1a, %xmm1, %xmm0
1675 jbe LABEL(use_sse4_2_exit)
/* Length-limited builds only; the counter update feeding this jbe is not
   visible in this listing.  */
1676 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1678 jbe LABEL(strcmp_exitz_sse4_2)
/* Second copy of the loop body; same steps as the first.  */
1683 jg LABEL(nibble_ashr_14_use_sse4_2)
1685 movdqa (%rdi, %rdx), %xmm0
1686 palignr $14, -16(%rdi, %rdx), %xmm0
1687 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1688 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1690 movdqa (%rsi,%rdx), %xmm1
1691 TOLOWER (%xmm0, %xmm1)
1692 pcmpistri $0x1a, %xmm1, %xmm0
1694 jbe LABEL(use_sse4_2_exit)
1695 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1697 jbe LABEL(strcmp_exitz_sse4_2)
1700 jmp LABEL(loop_ashr_14_use_sse4_2)
1703 LABEL(nibble_ashr_14_use_sse4_2):
/* Nibble path, entered from the jg instructions in the loop.  Reloads the
   previous aligned %rdi block and runs pcmpistri $0x3a on it against
   itself.  The compares that set the flags for the jae/ja below are not
   visible here; presumably they test the index pcmpistri leaves in %ecx
   to choose between resuming the loop and the common exit -- TODO
   confirm.  */
1705 movdqa -16(%rdi, %rdx), %xmm0
1707 pcmpistri $0x3a,%xmm0, %xmm0
1708 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1710 jae LABEL(nibble_ashr_use_sse4_2_exit)
1713 ja LABEL(loop_ashr_14_use_sse4_2)
1715 jmp LABEL(nibble_ashr_use_sse4_2_exit)
1718 * The following cases will be handled by ashr_15
1719 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1720 * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
1723 LABEL(ashr_15_sse4_2):
/* Entry point for the ashr_15 case; its offset is one of the entries of
   unaligned_table_sse4_2.  NOTE(review): this listing does not show every
   instruction of the routine (for instance the one that sets the flags
   tested by the jnz below), so these notes cover only what is visible.
   First, two aligned 16-byte loads from (%rdi) and (%rsi).  */
1725 movdqa (%rdi), %xmm2
1726 movdqa (%rsi), %xmm1
/* Byte-wise equality of %xmm1 against %xmm0, result left in %xmm0.
   Presumably %xmm0 is zero at this point, which would make this a NUL
   detector -- TODO confirm.  */
1727 pcmpeqb %xmm1, %xmm0
1729 TOLOWER (%xmm1, %xmm2)
/* %xmm2 = per-byte equality mask of %xmm1 and %xmm2; pmovmskb then
   copies the top bit of every %xmm2 byte into %r9d.  */
1730 pcmpeqb %xmm1, %xmm2
1732 pmovmskb %xmm2, %r9d
1736 jnz LABEL(less32bytes_sse4_2)
1738 movdqa (%rdi), %xmm3
/* Defined at the top of the file: empty in some build variants, and in
   the length-limited ones a counter update that can branch to
   strcmp_exitz_sse4_2.  */
1740 UPDATE_STRNCMP_COUNTER
1743 mov $16, %rcx /* index for loads */
1744 mov $15, %r9d /* byte position left over from less32bytes case */
1746 * Setup %r10 value allows us to detect crossing a page boundary.
1747 * When %r10 goes positive we have crossed a page boundary and
1748 * need to do a nibble.
1751 and $0xfff, %r10 /* offset into 4K page */
1753 sub $0x1000, %r10 /* subtract 4K pagesize */
1755 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1758 LABEL(loop_ashr_15_use_sse4_2):
/* Main loop; the body appears twice (unrolled).  The jg leaves for the
   nibble path; by the comment above it presumably fires once %r10 turns
   positive, i.e. a page boundary is reached -- the instruction updating
   %r10 is not visible here.  */
1760 jg LABEL(nibble_ashr_15_use_sse4_2)
/* palignr $15 leaves in %xmm0 the last byte of the previous aligned
   %rdi block followed by the first 15 bytes of the current one.
   pcmpistri $0x1a compares that with the 16 bytes at (%rsi,%rdx), which
   the case-insensitive builds load and pass through TOLOWER first.  The
   jbe after it is taken when pcmpistri set CF (a differing byte) or ZF
   (a NUL in the %rsi-side operand).  */
1762 movdqa (%rdi, %rdx), %xmm0
1763 palignr $15, -16(%rdi, %rdx), %xmm0
1764 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1765 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1767 movdqa (%rsi,%rdx), %xmm1
1768 TOLOWER (%xmm0, %xmm1)
1769 pcmpistri $0x1a, %xmm1, %xmm0
1771 jbe LABEL(use_sse4_2_exit)
/* Length-limited builds only; the counter update feeding this jbe is not
   visible in this listing.  */
1772 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1774 jbe LABEL(strcmp_exitz_sse4_2)
/* Second copy of the loop body; same steps as the first.  */
1779 jg LABEL(nibble_ashr_15_use_sse4_2)
1781 movdqa (%rdi, %rdx), %xmm0
1782 palignr $15, -16(%rdi, %rdx), %xmm0
1783 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1784 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1786 movdqa (%rsi,%rdx), %xmm1
1787 TOLOWER (%xmm0, %xmm1)
1788 pcmpistri $0x1a, %xmm1, %xmm0
1790 jbe LABEL(use_sse4_2_exit)
1791 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1793 jbe LABEL(strcmp_exitz_sse4_2)
1796 jmp LABEL(loop_ashr_15_use_sse4_2)
1799 LABEL(nibble_ashr_15_use_sse4_2):
/* Nibble path, entered from the jg instructions in the loop.  Reloads the
   previous aligned %rdi block and runs pcmpistri $0x3a on it against
   itself.  The compares that set the flags for the jae/ja below are not
   visible here; presumably they test the index pcmpistri leaves in %ecx
   to choose between resuming the loop and the common exit -- TODO
   confirm.  Unlike the other ashr_N cases shown, no jmp to the common
   exit follows the ja in this listing.  */
1801 movdqa -16(%rdi, %rdx), %xmm0
1803 pcmpistri $0x3a,%xmm0, %xmm0
1804 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1806 jae LABEL(nibble_ashr_use_sse4_2_exit)
1809 ja LABEL(loop_ashr_15_use_sse4_2)
1811 LABEL(nibble_ashr_use_sse4_2_exit):
/* Common target of the nibble_ashr_N paths.  Runs the pcmpistri $0x1a
   comparison of %xmm0 against the 16 bytes at (%rsi,%rdx); the
   case-insensitive builds load those bytes into %xmm1 and pass both
   registers through TOLOWER first.  Execution presumably continues into
   use_sse4_2_exit, the next label -- TODO confirm, the lines in between
   are not visible in this listing.  */
1812 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1813 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1815 movdqa (%rsi,%rdx), %xmm1
1816 TOLOWER (%xmm0, %xmm1)
1817 pcmpistri $0x1a, %xmm1, %xmm0
1820 LABEL(use_sse4_2_exit):
/* Reached with the flags of a pcmpistri $0x1a comparison still live.  A
   clear carry flag means pcmpistri reported no differing byte, so control
   goes to strcmp_exitz_sse4_2.  */
1821 jnc LABEL(strcmp_exitz_sse4_2)
/* Length-limited builds only; the compare feeding this jbe is not visible
   in this listing.  */
1822 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1824 jbe LABEL(strcmp_exitz_sse4_2)
/* Rebase %rdi by -16 plus the value in %r9 (the ashr_N entries load %r9d
   with N), then fetch one byte from each string at index %rdx.  The test
   that sets the flags for the jz below is not visible here.  */
1827 lea -16(%rdi, %r9), %rdi
1828 movzbl (%rdi, %rdx), %eax
1829 movzbl (%rsi, %rdx), %edx
1831 jz LABEL(use_sse4_2_ret_sse4_2)
1833 LABEL(use_sse4_2_ret_sse4_2):
/* Case-insensitive builds: replace the two byte values in %rdx and %rax
   with the 32-bit entries they index in the table addressed as
   _nl_C_LC_CTYPE_tolower+128*4.  The code that turns %eax/%edx into the
   return value is not visible in this listing.  */
1834 # if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
1835 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
1836 movl (%rcx,%rdx,4), %edx
1837 movl (%rcx,%rax,4), %eax
1843 LABEL(less32bytes_sse4_2):
/* Target of the jnz in each ashr_N entry.  Adds the offsets held in %rax
   and %rcx to %rdi and %rsi respectively.  The jz skips the xchg that
   swaps the two pointers back; the test of %r8d that feeds the jz is not
   visible in this listing.  */
1844 lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
1845 lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
1847 jz LABEL(ret_sse4_2)
1848 xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
1852 LABEL(less16bytes_sse4_2):
/* bsf turns the bit mask in %rdx into the index of its lowest set bit;
   the bytes at that index in both strings are then loaded into %ecx and
   %eax.  Length-limited builds may first leave through
   strcmp_exitz_sse4_2 (the compare feeding that jbe is not visible
   here).  Case-insensitive builds map both bytes through the 32-bit
   entries of the _nl_C_LC_CTYPE_tolower+128*4 table.  */
1853 bsf %rdx, %rdx /* find and store bit index in %rdx */
1855 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1857 jbe LABEL(strcmp_exitz_sse4_2)
1859 movzbl (%rsi, %rdx), %ecx
1860 movzbl (%rdi, %rdx), %eax
1862 # if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
1863 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
1864 movl (%rdx,%rcx,4), %ecx
1865 movl (%rdx,%rax,4), %eax
1871 LABEL(strcmp_exitz_sse4_2):
/* Shared exit label.  The branches to it visible in this file are taken
   when pcmpistri reported no differing byte (jnc) or when a check in the
   length-limited builds fires.  Its body is not visible in this listing;
   presumably it returns 0 -- TODO confirm.  */
1876 // XXX Same as code above
1877 LABEL(Byte0_sse4_2):
/* Case-insensitive builds map the byte values in %rcx and %rax through
   the 32-bit entries of the _nl_C_LC_CTYPE_tolower+128*4 table.  The
   loads of those bytes and the computation of the result are not visible
   in this listing.  */
1881 # if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
1882 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
1883 movl (%rdx,%rcx,4), %ecx
1884 movl (%rdx,%rax,4), %eax
1890 .size STRCMP_SSE42, .-STRCMP_SSE42 /* symbol size = distance from STRCMP_SSE42 to this point */
1897 /* Put all SSE 4.2 functions together. */
1898 .section .rodata.sse4.2,"a",@progbits
1900 LABEL(unaligned_table_sse4_2):
/* Sixteen .int entries, each the distance from this table's own label to
   one ashr_N entry point.  The order is ashr_1 through ashr_15, with
   ashr_0 as the final entry.  */
1901 .int LABEL(ashr_1_sse4_2) - LABEL(unaligned_table_sse4_2)
1902 .int LABEL(ashr_2_sse4_2) - LABEL(unaligned_table_sse4_2)
1903 .int LABEL(ashr_3_sse4_2) - LABEL(unaligned_table_sse4_2)
1904 .int LABEL(ashr_4_sse4_2) - LABEL(unaligned_table_sse4_2)
1905 .int LABEL(ashr_5_sse4_2) - LABEL(unaligned_table_sse4_2)
1906 .int LABEL(ashr_6_sse4_2) - LABEL(unaligned_table_sse4_2)
1907 .int LABEL(ashr_7_sse4_2) - LABEL(unaligned_table_sse4_2)
1908 .int LABEL(ashr_8_sse4_2) - LABEL(unaligned_table_sse4_2)
1909 .int LABEL(ashr_9_sse4_2) - LABEL(unaligned_table_sse4_2)
1910 .int LABEL(ashr_10_sse4_2) - LABEL(unaligned_table_sse4_2)
1911 .int LABEL(ashr_11_sse4_2) - LABEL(unaligned_table_sse4_2)
1912 .int LABEL(ashr_12_sse4_2) - LABEL(unaligned_table_sse4_2)
1913 .int LABEL(ashr_13_sse4_2) - LABEL(unaligned_table_sse4_2)
1914 .int LABEL(ashr_14_sse4_2) - LABEL(unaligned_table_sse4_2)
1915 .int LABEL(ashr_15_sse4_2) - LABEL(unaligned_table_sse4_2)
1916 .int LABEL(ashr_0_sse4_2) - LABEL(unaligned_table_sse4_2)
/* ENTRY and END as defined here ignore their NAME argument: ENTRY opens a
   function symbol called STRCMP_SSE2 (type directive, label and
   cfi_startproc) and END closes it (cfi_endproc and .size).  Presumably
   this is what renames the implementation pulled in by the #include of
   "../strcmp.S" at the end of this file -- TODO confirm.  */
1920 # define ENTRY(name) \
1921 .type STRCMP_SSE2, @function; \
1923 STRCMP_SSE2: cfi_startproc; \
1926 # define END(name) \
1927 cfi_endproc; .size STRCMP_SSE2, .-STRCMP_SSE2
/* USE_AS_STRCASECMP_L builds: ENTRY2/END2 bracket a second function
   symbol, __strcasecmp_sse2, again without using the NAME argument.  */
1929 # ifdef USE_AS_STRCASECMP_L
1930 # define ENTRY2(name) \
1931 .type __strcasecmp_sse2, @function; \
1933 __strcasecmp_sse2: cfi_startproc; \
1935 # define END2(name) \
1936 cfi_endproc; .size __strcasecmp_sse2, .-__strcasecmp_sse2
/* USE_AS_STRNCASECMP_L builds: ENTRY2/END2 bracket a second function
   symbol, __strncasecmp_sse2, again without using the NAME argument.  */
1939 # ifdef USE_AS_STRNCASECMP_L
1940 # define ENTRY2(name) \
1941 .type __strncasecmp_sse2, @function; \
1943 __strncasecmp_sse2: cfi_startproc; \
1945 # define END2(name) \
1946 cfi_endproc; .size __strncasecmp_sse2, .-__strncasecmp_sse2
1949 # undef libc_hidden_builtin_def
1950 /* It doesn't make sense to send libc-internal strcmp calls through a PLT.
1951 The speedup we get from using SSE4.2 instruction is likely eaten away
1952 by the indirect call in the PLT. */
/* The replacement ignores NAME: it declares __GI_STRCMP global and makes
   it an alias of STRCMP_SSE2.  */
1953 # define libc_hidden_builtin_def(name) \
1954 .globl __GI_STRCMP; __GI_STRCMP = STRCMP_SSE2
1957 #include "../strcmp.S"