2 Copyright (C) 2009, 2010 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
22 #include <init-arch.h>
25 /* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
26 if the new counter > the old one or is 0. */
27 # define UPDATE_STRNCMP_COUNTER \
28 /* calculate left number to compare */ \
29 lea -16(%rcx, %r11), %r9; \
31 jb LABEL(strcmp_exitz_sse4_2); \
33 je LABEL(strcmp_exitz_sse4_2); \
36 # define STRCMP_SSE42 __strncmp_sse42
37 # define STRCMP_SSSE3 __strncmp_ssse3
38 # define STRCMP_SSE2 __strncmp_sse2
39 # define __GI_STRCMP __GI_strncmp
40 #elif defined USE_AS_STRCASECMP_L
41 # include "locale-defines.h"
43 # define UPDATE_STRNCMP_COUNTER
45 # define STRCMP_SSE42 __strcasecmp_l_sse42
46 # define STRCMP_SSSE3 __strcasecmp_l_ssse3
47 # define STRCMP_SSE2 __strcasecmp_l_sse2
48 # define __GI_STRCMP __GI___strcasecmp_l
49 #elif defined USE_AS_STRNCASECMP_L
50 # include "locale-defines.h"
52 /* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
53 if the new counter > the old one or is 0. */
54 # define UPDATE_STRNCMP_COUNTER \
55 /* calculate left number to compare */ \
56 lea -16(%rcx, %r11), %r9; \
58 jb LABEL(strcmp_exitz_sse4_2); \
60 je LABEL(strcmp_exitz_sse4_2); \
63 # define STRCMP_SSE42 __strncasecmp_l_sse42
64 # define STRCMP_SSSE3 __strncasecmp_l_ssse3
65 # define STRCMP_SSE2 __strncasecmp_l_sse2
66 # define __GI_STRCMP __GI___strncasecmp_l
68 # define UPDATE_STRNCMP_COUNTER
70 # define STRCMP strcmp
71 # define STRCMP_SSE42 __strcmp_sse42
72 # define STRCMP_SSSE3 __strcmp_ssse3
73 # define STRCMP_SSE2 __strcmp_sse2
74 # define __GI_STRCMP __GI_strcmp
79 # define LABEL(l) L(l)
82 /* Define multiple versions only for the definition in libc. Don't
83 define multiple versions for strncmp in the static library, since
84 strncmp is needed before the initialization has happened. */
85 #if (defined SHARED || !defined USE_AS_STRNCMP) && !defined NOT_IN_libc
/* IFUNC resolver for STRCMP: loads into %rax the address of one of
   STRCMP_SSE42, STRCMP_SSSE3 or STRCMP_SSE2, consulting the feature
   bits stored in __cpu_features.
   NOTE(review): the conditional branches and returns that sit between
   the instructions below are not visible here -- confirm the exact
   selection order against the complete file.  */
88 .type STRCMP, @gnu_indirect_function
89 cmpl $0, __cpu_features+KIND_OFFSET(%rip) /* is the 'kind' field 0?  (presumably: not yet initialized) */
91 call __init_cpu_features /* populate __cpu_features */
93 leaq STRCMP_SSE42(%rip), %rax /* candidate: SSE4.2 implementation */
94 testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) /* SSE4.2 feature bit set? */
96 leaq STRCMP_SSSE3(%rip), %rax /* candidate: SSSE3 implementation */
97 testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) /* SSSE3 feature bit set? */
99 leaq STRCMP_SSE2(%rip), %rax /* candidate: SSE2 implementation */
103 # ifdef USE_AS_STRCASECMP_L
/* IFUNC resolver for __strcasecmp (also exported as the weak alias
   strcasecmp): loads into %rax the address of the SSE4.2, SSSE3 or SSE2
   variant, consulting the feature bits stored in __cpu_features.
   NOTE(review): the conditional branches and returns between the
   instructions below are not visible here -- confirm the selection
   order against the complete file.  */
105 .type __strcasecmp, @gnu_indirect_function
106 cmpl $0, __cpu_features+KIND_OFFSET(%rip) /* is the 'kind' field 0?  (presumably: not yet initialized) */
108 call __init_cpu_features /* populate __cpu_features */
110 leaq __strcasecmp_sse42(%rip), %rax /* candidate: SSE4.2 implementation */
111 testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) /* SSE4.2 feature bit set? */
113 leaq __strcasecmp_ssse3(%rip), %rax /* candidate: SSSE3 implementation */
114 testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) /* SSSE3 feature bit set? */
116 leaq __strcasecmp_sse2(%rip), %rax /* candidate: SSE2 implementation */
119 weak_alias (__strcasecmp, strcasecmp)
121 # ifdef USE_AS_STRNCASECMP_L
/* IFUNC resolver for __strncasecmp (also exported as the weak alias
   strncasecmp): loads into %rax the address of the SSE4.2, SSSE3 or
   SSE2 variant, consulting the feature bits stored in __cpu_features.
   NOTE(review): the conditional branches and returns between the
   instructions below are not visible here -- confirm the selection
   order against the complete file.  */
123 .type __strncasecmp, @gnu_indirect_function
124 cmpl $0, __cpu_features+KIND_OFFSET(%rip) /* is the 'kind' field 0?  (presumably: not yet initialized) */
126 call __init_cpu_features /* populate __cpu_features */
128 leaq __strncasecmp_sse42(%rip), %rax /* candidate: SSE4.2 implementation */
129 testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) /* SSE4.2 feature bit set? */
131 leaq __strncasecmp_ssse3(%rip), %rax /* candidate: SSSE3 implementation */
132 testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) /* SSSE3 feature bit set? */
134 leaq __strncasecmp_sse2(%rip), %rax /* candidate: SSE2 implementation */
137 weak_alias (__strncasecmp, strncasecmp)
142 | _SIDD_CMP_EQUAL_EACH
143 | _SIDD_NEGATIVE_POLARITY
144 | _SIDD_LEAST_SIGNIFICANT
145 on pcmpistri to find out if two 16byte data elements are the same
146 and the offset of the first different byte. There are 4 cases:
148 1. Both 16byte data elements are valid and identical.
149 2. Both 16byte data elements have EOS and identical.
150 3. Both 16byte data elements are valid and they differ at offset X.
151 4. At least one 16byte data element has EOS at offset X. Two 16byte
152 data elements must differ at or before offset X.
154 Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases:
156 case ECX CFlag ZFlag SFlag
162 We exit from the loop for cases 2, 3 and 4 with jbe which branches
163 when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for
166 /* Put all SSE 4.2 functions together. */
167 .section .text.sse4.2,"ax",@progbits
169 .type STRCMP_SSE42, @function
170 # ifdef USE_AS_STRCASECMP_L
/* Entry stub for the variant without an explicit locale argument.  It
   looks up the thread-local __libc_tsd_LOCALE and then falls through
   into the strcasecmp_l code that follows.
   NOTE(review): the instruction that loads the locale pointer into the
   argument register is not visible here -- confirm.  */
171 ENTRY (__strcasecmp_sse42)
172 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax /* %rax = TLS offset of __libc_tsd_LOCALE, read from the GOT */
175 // XXX The 5-byte NOP below should be placed before the function
177 .byte 0x0f,0x1f,0x44,0x00,0x00 /* 5-byte NOP encoding (nopl 0x0(%rax,%rax,1)) */
178 END (__strcasecmp_sse42)
179 /* FALLTHROUGH to strcasecmp_l. */
181 # ifdef USE_AS_STRNCASECMP_L
/* Entry stub for the variant without an explicit locale argument.  It
   looks up the thread-local __libc_tsd_LOCALE and then falls through
   into the strncasecmp_l code that follows.
   NOTE(review): the instruction that loads the locale pointer into the
   argument register is not visible here -- confirm.  */
182 ENTRY (__strncasecmp_sse42)
183 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax /* %rax = TLS offset of __libc_tsd_LOCALE, read from the GOT */
186 // XXX The 5-byte NOP below should be placed before the function
188 .byte 0x0f,0x1f,0x44,0x00,0x00 /* 5-byte NOP encoding (nopl 0x0(%rax,%rax,1)) */
189 END (__strncasecmp_sse42)
190 /* FALLTHROUGH to strncasecmp_l. */
198 * This implementation uses SSE to compare up to 16 bytes at a time.
200 # ifdef USE_AS_STRCASECMP_L
201 /* We have to fall back on the C implementation for locales
202 with encodings not matching ASCII for single bytes. */
203 # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
204 movq LOCALE_T___LOCALES+LC_CTYPE*8(%rdx), %rax /* %rax = LC_CTYPE entry of the locale passed in %rdx */
/* Test bit 0 of the _NL_CTYPE_NONASCII_CASE value.  The mask has to be
   non-zero: TEST with a zero mask always produces ZF=1, so the jne
   below could never be taken and the C fallback would be unreachable.  */
208 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
209 jne __strcasecmp_l_nonascii /* flag set: hand over to the C implementation */
211 # ifdef USE_AS_STRNCASECMP_L
212 /* We have to fall back on the C implementation for locales
213 with encodings not matching ASCII for single bytes. */
214 # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
215 movq LOCALE_T___LOCALES+LC_CTYPE*8(%rcx), %rax /* %rax = LC_CTYPE entry of the locale passed in %rcx */
/* Test bit 0 of the _NL_CTYPE_NONASCII_CASE value.  The mask has to be
   non-zero: TEST with a zero mask always produces ZF=1, so the jne
   below could never be taken and the C fallback would be unreachable.  */
219 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
220 jne __strncasecmp_l_nonascii /* flag set: hand over to the C implementation */
223 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
225 je LABEL(strcmp_exitz_sse4_2)
227 je LABEL(Byte0_sse4_2)
232 /* Use 64bit AND here to avoid long NOP padding. */
233 and $0x3f, %rcx /* rsi alignment in cache line */
234 and $0x3f, %rax /* rdi alignment in cache line */
235 # if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
236 .section .rodata.cst16,"aM",@progbits,16
239 .quad 0x4040404040404040
240 .quad 0x4040404040404040
242 .quad 0x5b5b5b5b5b5b5b5b
243 .quad 0x5b5b5b5b5b5b5b5b
245 .quad 0x2020202020202020
246 .quad 0x2020202020202020
248 movdqa .Lbelowupper_sse4(%rip), %xmm4
249 # define UCLOW_reg %xmm4
250 movdqa .Ltopupper_sse4(%rip), %xmm5
251 # define UCHIGH_reg %xmm5
252 movdqa .Ltouppermask_sse4(%rip), %xmm6
253 # define LCQWORD_reg %xmm6
256 ja LABEL(crosscache_sse4_2)/* rsi: 16-byte load will cross cache line */
258 ja LABEL(crosscache_sse4_2)/* rdi: 16-byte load will cross cache line */
261 # if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* TOLOWER (reg1, reg2): case-folding helper used by the strcasecmp
   variants.  Writes %xmm7-%xmm10 as scratch registers.
   For each operand it builds per-byte masks with signed byte compares
   (pcmpgtb): "operand > UCLOW_reg" and "UCHIGH_reg > operand", and then
   ANDs the result with LCQWORD_reg.
   NOTE(review): the step that combines %xmm7 with %xmm8 and the final
   merge of the masks back into reg1/reg2 are not visible here --
   confirm against the complete macro before relying on this summary.  */
262 # define TOLOWER(reg1, reg2) \
263 movdqa reg1, %xmm7; \
264 movdqa UCHIGH_reg, %xmm8; \
265 movdqa reg2, %xmm9; \
266 movdqa UCHIGH_reg, %xmm10; \
267 pcmpgtb UCLOW_reg, %xmm7; \
268 pcmpgtb reg1, %xmm8; \
269 pcmpgtb UCLOW_reg, %xmm9; \
270 pcmpgtb reg2, %xmm10; \
272 pand %xmm10, %xmm9; \
273 pand LCQWORD_reg, %xmm7; \
274 pand LCQWORD_reg, %xmm9; \
277 TOLOWER (%xmm1, %xmm2)
279 # define TOLOWER(reg1, reg2)
/* Compare the first 16 bytes held in %xmm1 and %xmm2.  After the psubb
   a byte of %xmm1 is 0xff only where the two inputs are equal and the
   %xmm1 input byte is not null; a null byte or a mismatch gives a
   different value.  */
281 pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
282 pcmpeqb %xmm1, %xmm0 /* %xmm0 = 0xff in every position where %xmm1 holds a null char */
283 pcmpeqb %xmm2, %xmm1 /* %xmm1 = 0xff in every position where the two inputs are equal */
284 psubb %xmm0, %xmm1 /* subtract the null mask from the equality mask */
286 sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff, so the result is 0 */
287 jnz LABEL(less16bytes_sse4_2)/* If not, find different value or null char */
288 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
290 jbe LABEL(strcmp_exitz_sse4_2)/* finish comparison */
292 add $16, %rsi /* prepare to search next 16 bytes */
293 add $16, %rdi /* prepare to search next 16 bytes */
296 * Determine source and destination string offsets from 16-byte alignment.
297 * Use relative offset difference between the two to determine which case
301 LABEL(crosscache_sse4_2):
302 and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */
303 and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */
304 mov $0xffff, %edx /* for equivalent offset */
306 and $0xf, %ecx /* offset of rsi */
307 and $0xf, %eax /* offset of rdi */
309 je LABEL(ashr_0_sse4_2) /* rsi and rdi relative offset same */
310 ja LABEL(bigger_sse4_2)
311 mov %edx, %r8d /* r8d is offset flag for exit tail */
314 LABEL(bigger_sse4_2):
317 lea LABEL(unaligned_table_sse4_2)(%rip), %r10
318 movslq (%r10, %r9,4), %r9
319 lea (%r10, %r9), %r10
320 jmp *%r10 /* jump to corresponding case */
323 * The following cases will be handled by ashr_0
324 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
325 * n(0~15) n(0~15) 15(15+ n-n) ashr_0
328 LABEL(ashr_0_sse4_2):
331 pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
332 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
333 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
334 pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
337 TOLOWER (%xmm1, %xmm2)
338 pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */
340 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
342 shr %cl, %edx /* adjust 0xffff for offset */
343 shr %cl, %r9d /* adjust for 16-byte offset */
346 * %edx must equal %r9d if the remaining (16 - %rcx) bytes are equal to
347 * the bytes starting at (16 - %rax) and no null char was seen.
349 jne LABEL(less32bytes_sse4_2) /* mismatch or null char */
350 UPDATE_STRNCMP_COUNTER
353 pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */
356 * Now both strings are aligned at 16-byte boundary. Loop over strings
357 * checking 32-bytes per iteration.
359 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
361 LABEL(ashr_0_use_sse4_2):
362 movdqa (%rdi,%rdx), %xmm0
363 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
364 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
366 movdqa (%rsi,%rdx), %xmm1
367 TOLOWER (%xmm0, %xmm1)
368 pcmpistri $0x1a, %xmm1, %xmm0
371 jbe LABEL(ashr_0_use_sse4_2_exit)
372 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
374 jbe LABEL(strcmp_exitz_sse4_2)
377 movdqa (%rdi,%rdx), %xmm0
378 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
379 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
381 movdqa (%rsi,%rdx), %xmm1
382 TOLOWER (%xmm0, %xmm1)
383 pcmpistri $0x1a, %xmm1, %xmm0
386 jbe LABEL(ashr_0_use_sse4_2_exit)
387 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
389 jbe LABEL(strcmp_exitz_sse4_2)
391 jmp LABEL(ashr_0_use_sse4_2)
/* Exit path of the aligned (ashr_0) pcmpistri loop.  Loads the two bytes
   at the reported index into %eax and %edx; for the case-insensitive
   variants both are then mapped through the C-locale tolower table.  */
395 LABEL(ashr_0_use_sse4_2_exit):
396 jnc LABEL(strcmp_exitz_sse4_2) /* CFlag == 0: no differing byte was found, take the return-0 path */
397 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
399 jbe LABEL(strcmp_exitz_sse4_2) /* NOTE(review): the instruction setting these flags is not visible here -- presumably a length check */
401 lea -16(%rdx, %rcx), %rcx /* %rcx = %rdx + %rcx - 16: byte offset of the position to load */
402 movzbl (%rdi, %rcx), %eax /* zero-extended byte taken from the %rdi string */
403 movzbl (%rsi, %rcx), %edx /* zero-extended byte taken from the %rsi string */
404 # if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
405 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx /* table of 4-byte entries, addressed 128 entries past its start */
406 movl (%rcx,%rax,4), %eax /* %eax = table entry for the first byte */
407 movl (%rcx,%rdx,4), %edx /* %edx = table entry for the second byte */
415 * The following cases will be handled by ashr_1
416 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
417 * n(15) n -15 0(15 +(n-15) - n) ashr_1
420 LABEL(ashr_1_sse4_2):
424 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
425 pslldq $15, %xmm2 /* shift first string to align with second */
426 TOLOWER (%xmm1, %xmm2)
427 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
428 psubb %xmm0, %xmm2 /* packed sub of comparison results*/
430 shr %cl, %edx /* adjust 0xffff for offset */
431 shr %cl, %r9d /* adjust for 16-byte offset */
433 jnz LABEL(less32bytes_sse4_2)/* mismatch or null char seen */
435 UPDATE_STRNCMP_COUNTER
438 mov $16, %rcx /* index for loads*/
439 mov $1, %r9d /* byte position left over from less32bytes case */
441 * Set up %r10 so that we can detect crossing a page boundary.
442 * When %r10 goes positive we have crossed a page boundary and
443 * need to do a nibble.
446 and $0xfff, %r10 /* offset into 4K page */
447 sub $0x1000, %r10 /* subtract 4K pagesize */
448 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
451 LABEL(loop_ashr_1_use_sse4_2):
453 jg LABEL(nibble_ashr_1_use_sse4_2)
455 LABEL(nibble_ashr_1_use_sse4_2_restart):
456 movdqa (%rdi, %rdx), %xmm0
457 palignr $1, -16(%rdi, %rdx), %xmm0
458 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
459 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
461 movdqa (%rsi,%rdx), %xmm1
462 TOLOWER (%xmm0, %xmm1)
463 pcmpistri $0x1a, %xmm1, %xmm0
465 jbe LABEL(use_sse4_2_exit)
466 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
468 jbe LABEL(strcmp_exitz_sse4_2)
473 jg LABEL(nibble_ashr_1_use_sse4_2)
475 movdqa (%rdi, %rdx), %xmm0
476 palignr $1, -16(%rdi, %rdx), %xmm0
477 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
478 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
480 movdqa (%rsi,%rdx), %xmm1
481 TOLOWER (%xmm0, %xmm1)
482 pcmpistri $0x1a, %xmm1, %xmm0
484 jbe LABEL(use_sse4_2_exit)
485 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
487 jbe LABEL(strcmp_exitz_sse4_2)
490 jmp LABEL(loop_ashr_1_use_sse4_2)
493 LABEL(nibble_ashr_1_use_sse4_2):
495 movdqa -16(%rdi, %rdx), %xmm0
497 pcmpistri $0x3a,%xmm0, %xmm0
498 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
500 jae LABEL(nibble_ashr_use_sse4_2_exit)
503 ja LABEL(nibble_ashr_1_use_sse4_2_restart)
505 jmp LABEL(nibble_ashr_use_sse4_2_exit)
508 * The following cases will be handled by ashr_2
509 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
510 * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
513 LABEL(ashr_2_sse4_2):
519 TOLOWER (%xmm1, %xmm2)
526 jnz LABEL(less32bytes_sse4_2)
528 UPDATE_STRNCMP_COUNTER
531 mov $16, %rcx /* index for loads */
532 mov $2, %r9d /* byte position left over from less32bytes case */
534 * Set up %r10 so that we can detect crossing a page boundary.
535 * When %r10 goes positive we have crossed a page boundary and
536 * need to do a nibble.
539 and $0xfff, %r10 /* offset into 4K page */
540 sub $0x1000, %r10 /* subtract 4K pagesize */
541 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
544 LABEL(loop_ashr_2_use_sse4_2):
546 jg LABEL(nibble_ashr_2_use_sse4_2)
548 LABEL(nibble_ashr_2_use_sse4_2_restart):
549 movdqa (%rdi, %rdx), %xmm0
550 palignr $2, -16(%rdi, %rdx), %xmm0
551 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
552 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
554 movdqa (%rsi,%rdx), %xmm1
555 TOLOWER (%xmm0, %xmm1)
556 pcmpistri $0x1a, %xmm1, %xmm0
558 jbe LABEL(use_sse4_2_exit)
559 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
561 jbe LABEL(strcmp_exitz_sse4_2)
566 jg LABEL(nibble_ashr_2_use_sse4_2)
568 movdqa (%rdi, %rdx), %xmm0
569 palignr $2, -16(%rdi, %rdx), %xmm0
570 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
571 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
573 movdqa (%rsi,%rdx), %xmm1
574 TOLOWER (%xmm0, %xmm1)
575 pcmpistri $0x1a, %xmm1, %xmm0
577 jbe LABEL(use_sse4_2_exit)
578 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
580 jbe LABEL(strcmp_exitz_sse4_2)
583 jmp LABEL(loop_ashr_2_use_sse4_2)
586 LABEL(nibble_ashr_2_use_sse4_2):
588 movdqa -16(%rdi, %rdx), %xmm0
590 pcmpistri $0x3a,%xmm0, %xmm0
591 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
593 jae LABEL(nibble_ashr_use_sse4_2_exit)
596 ja LABEL(nibble_ashr_2_use_sse4_2_restart)
598 jmp LABEL(nibble_ashr_use_sse4_2_exit)
601 * The following cases will be handled by ashr_3
602 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
603 * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
606 LABEL(ashr_3_sse4_2):
612 TOLOWER (%xmm1, %xmm2)
619 jnz LABEL(less32bytes_sse4_2)
622 UPDATE_STRNCMP_COUNTER
625 mov $16, %rcx /* index for loads */
626 mov $3, %r9d /* byte position left over from less32bytes case */
628 * Set up %r10 so that we can detect crossing a page boundary.
629 * When %r10 goes positive we have crossed a page boundary and
630 * need to do a nibble.
633 and $0xfff, %r10 /* offset into 4K page */
634 sub $0x1000, %r10 /* subtract 4K pagesize */
635 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
637 LABEL(loop_ashr_3_use_sse4_2):
639 jg LABEL(nibble_ashr_3_use_sse4_2)
641 LABEL(nibble_ashr_3_use_sse4_2_restart):
642 movdqa (%rdi, %rdx), %xmm0
643 palignr $3, -16(%rdi, %rdx), %xmm0
644 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
645 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
647 movdqa (%rsi,%rdx), %xmm1
648 TOLOWER (%xmm0, %xmm1)
649 pcmpistri $0x1a, %xmm1, %xmm0
651 jbe LABEL(use_sse4_2_exit)
652 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
654 jbe LABEL(strcmp_exitz_sse4_2)
659 jg LABEL(nibble_ashr_3_use_sse4_2)
661 movdqa (%rdi, %rdx), %xmm0
662 palignr $3, -16(%rdi, %rdx), %xmm0
663 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
664 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
666 movdqa (%rsi,%rdx), %xmm1
667 TOLOWER (%xmm0, %xmm1)
668 pcmpistri $0x1a, %xmm1, %xmm0
670 jbe LABEL(use_sse4_2_exit)
671 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
673 jbe LABEL(strcmp_exitz_sse4_2)
676 jmp LABEL(loop_ashr_3_use_sse4_2)
679 LABEL(nibble_ashr_3_use_sse4_2):
681 movdqa -16(%rdi, %rdx), %xmm0
683 pcmpistri $0x3a,%xmm0, %xmm0
684 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
686 jae LABEL(nibble_ashr_use_sse4_2_exit)
689 ja LABEL(nibble_ashr_3_use_sse4_2_restart)
691 jmp LABEL(nibble_ashr_use_sse4_2_exit)
694 * The following cases will be handled by ashr_4
695 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
696 * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
699 LABEL(ashr_4_sse4_2):
705 TOLOWER (%xmm1, %xmm2)
712 jnz LABEL(less32bytes_sse4_2)
715 UPDATE_STRNCMP_COUNTER
718 mov $16, %rcx /* index for loads */
719 mov $4, %r9d /* byte position left over from less32bytes case */
721 * Set up %r10 so that we can detect crossing a page boundary.
722 * When %r10 goes positive we have crossed a page boundary and
723 * need to do a nibble.
726 and $0xfff, %r10 /* offset into 4K page */
727 sub $0x1000, %r10 /* subtract 4K pagesize */
728 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
731 LABEL(loop_ashr_4_use_sse4_2):
733 jg LABEL(nibble_ashr_4_use_sse4_2)
735 LABEL(nibble_ashr_4_use_sse4_2_restart):
736 movdqa (%rdi, %rdx), %xmm0
737 palignr $4, -16(%rdi, %rdx), %xmm0
738 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
739 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
741 movdqa (%rsi,%rdx), %xmm1
742 TOLOWER (%xmm0, %xmm1)
743 pcmpistri $0x1a, %xmm1, %xmm0
745 jbe LABEL(use_sse4_2_exit)
746 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
748 jbe LABEL(strcmp_exitz_sse4_2)
753 jg LABEL(nibble_ashr_4_use_sse4_2)
755 movdqa (%rdi, %rdx), %xmm0
756 palignr $4, -16(%rdi, %rdx), %xmm0
757 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
758 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
760 movdqa (%rsi,%rdx), %xmm1
761 TOLOWER (%xmm0, %xmm1)
762 pcmpistri $0x1a, %xmm1, %xmm0
764 jbe LABEL(use_sse4_2_exit)
765 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
767 jbe LABEL(strcmp_exitz_sse4_2)
770 jmp LABEL(loop_ashr_4_use_sse4_2)
773 LABEL(nibble_ashr_4_use_sse4_2):
775 movdqa -16(%rdi, %rdx), %xmm0
777 pcmpistri $0x3a,%xmm0, %xmm0
778 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
780 jae LABEL(nibble_ashr_use_sse4_2_exit)
783 ja LABEL(nibble_ashr_4_use_sse4_2_restart)
785 jmp LABEL(nibble_ashr_use_sse4_2_exit)
788 * The following cases will be handled by ashr_5
789 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
790 * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
793 LABEL(ashr_5_sse4_2):
799 TOLOWER (%xmm1, %xmm2)
806 jnz LABEL(less32bytes_sse4_2)
809 UPDATE_STRNCMP_COUNTER
812 mov $16, %rcx /* index for loads */
813 mov $5, %r9d /* byte position left over from less32bytes case */
815 * Set up %r10 so that we can detect crossing a page boundary.
816 * When %r10 goes positive we have crossed a page boundary and
817 * need to do a nibble.
820 and $0xfff, %r10 /* offset into 4K page */
821 sub $0x1000, %r10 /* subtract 4K pagesize */
822 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
825 LABEL(loop_ashr_5_use_sse4_2):
827 jg LABEL(nibble_ashr_5_use_sse4_2)
829 LABEL(nibble_ashr_5_use_sse4_2_restart):
830 movdqa (%rdi, %rdx), %xmm0
831 palignr $5, -16(%rdi, %rdx), %xmm0
832 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
833 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
835 movdqa (%rsi,%rdx), %xmm1
836 TOLOWER (%xmm0, %xmm1)
837 pcmpistri $0x1a, %xmm1, %xmm0
839 jbe LABEL(use_sse4_2_exit)
840 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
842 jbe LABEL(strcmp_exitz_sse4_2)
847 jg LABEL(nibble_ashr_5_use_sse4_2)
849 movdqa (%rdi, %rdx), %xmm0
851 palignr $5, -16(%rdi, %rdx), %xmm0
852 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
853 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
855 movdqa (%rsi,%rdx), %xmm1
856 TOLOWER (%xmm0, %xmm1)
857 pcmpistri $0x1a, %xmm1, %xmm0
859 jbe LABEL(use_sse4_2_exit)
860 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
862 jbe LABEL(strcmp_exitz_sse4_2)
865 jmp LABEL(loop_ashr_5_use_sse4_2)
868 LABEL(nibble_ashr_5_use_sse4_2):
870 movdqa -16(%rdi, %rdx), %xmm0
872 pcmpistri $0x3a,%xmm0, %xmm0
873 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
875 jae LABEL(nibble_ashr_use_sse4_2_exit)
878 ja LABEL(nibble_ashr_5_use_sse4_2_restart)
880 jmp LABEL(nibble_ashr_use_sse4_2_exit)
883 * The following cases will be handled by ashr_6
884 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
885 * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
888 LABEL(ashr_6_sse4_2):
894 TOLOWER (%xmm1, %xmm2)
901 jnz LABEL(less32bytes_sse4_2)
904 UPDATE_STRNCMP_COUNTER
907 mov $16, %rcx /* index for loads */
908 mov $6, %r9d /* byte position left over from less32bytes case */
910 * Set up %r10 so that we can detect crossing a page boundary.
911 * When %r10 goes positive we have crossed a page boundary and
912 * need to do a nibble.
915 and $0xfff, %r10 /* offset into 4K page */
916 sub $0x1000, %r10 /* subtract 4K pagesize */
917 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
920 LABEL(loop_ashr_6_use_sse4_2):
922 jg LABEL(nibble_ashr_6_use_sse4_2)
924 LABEL(nibble_ashr_6_use_sse4_2_restart):
925 movdqa (%rdi, %rdx), %xmm0
926 palignr $6, -16(%rdi, %rdx), %xmm0
927 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
928 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
930 movdqa (%rsi,%rdx), %xmm1
931 TOLOWER (%xmm0, %xmm1)
932 pcmpistri $0x1a, %xmm1, %xmm0
934 jbe LABEL(use_sse4_2_exit)
935 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
937 jbe LABEL(strcmp_exitz_sse4_2)
942 jg LABEL(nibble_ashr_6_use_sse4_2)
944 movdqa (%rdi, %rdx), %xmm0
945 palignr $6, -16(%rdi, %rdx), %xmm0
946 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
947 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
949 movdqa (%rsi,%rdx), %xmm1
950 TOLOWER (%xmm0, %xmm1)
951 pcmpistri $0x1a, %xmm1, %xmm0
953 jbe LABEL(use_sse4_2_exit)
954 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
956 jbe LABEL(strcmp_exitz_sse4_2)
959 jmp LABEL(loop_ashr_6_use_sse4_2)
962 LABEL(nibble_ashr_6_use_sse4_2):
964 movdqa -16(%rdi, %rdx), %xmm0
966 pcmpistri $0x3a,%xmm0, %xmm0
967 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
969 jae LABEL(nibble_ashr_use_sse4_2_exit)
972 ja LABEL(nibble_ashr_6_use_sse4_2_restart)
974 jmp LABEL(nibble_ashr_use_sse4_2_exit)
977 * The following cases will be handled by ashr_7
978 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
979 * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
982 LABEL(ashr_7_sse4_2):
988 TOLOWER (%xmm1, %xmm2)
995 jnz LABEL(less32bytes_sse4_2)
998 UPDATE_STRNCMP_COUNTER
1001 mov $16, %rcx /* index for loads */
1002 mov $7, %r9d /* byte position left over from less32bytes case */
1004 * Set up %r10 so that we can detect crossing a page boundary.
1005 * When %r10 goes positive we have crossed a page boundary and
1006 * need to do a nibble.
1009 and $0xfff, %r10 /* offset into 4K page */
1010 sub $0x1000, %r10 /* subtract 4K pagesize */
1011 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1014 LABEL(loop_ashr_7_use_sse4_2):
1016 jg LABEL(nibble_ashr_7_use_sse4_2)
1018 LABEL(nibble_ashr_7_use_sse4_2_restart):
1019 movdqa (%rdi, %rdx), %xmm0
1020 palignr $7, -16(%rdi, %rdx), %xmm0
1021 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1022 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1024 movdqa (%rsi,%rdx), %xmm1
1025 TOLOWER (%xmm0, %xmm1)
1026 pcmpistri $0x1a, %xmm1, %xmm0
1028 jbe LABEL(use_sse4_2_exit)
1029 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1031 jbe LABEL(strcmp_exitz_sse4_2)
1036 jg LABEL(nibble_ashr_7_use_sse4_2)
1038 movdqa (%rdi, %rdx), %xmm0
1039 palignr $7, -16(%rdi, %rdx), %xmm0
1040 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1041 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1043 movdqa (%rsi,%rdx), %xmm1
1044 TOLOWER (%xmm0, %xmm1)
1045 pcmpistri $0x1a, %xmm1, %xmm0
1047 jbe LABEL(use_sse4_2_exit)
1048 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1050 jbe LABEL(strcmp_exitz_sse4_2)
1053 jmp LABEL(loop_ashr_7_use_sse4_2)
1056 LABEL(nibble_ashr_7_use_sse4_2):
1058 movdqa -16(%rdi, %rdx), %xmm0
1060 pcmpistri $0x3a,%xmm0, %xmm0
1061 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1063 jae LABEL(nibble_ashr_use_sse4_2_exit)
1066 ja LABEL(nibble_ashr_7_use_sse4_2_restart)
1068 jmp LABEL(nibble_ashr_use_sse4_2_exit)
1071 * The following cases will be handled by ashr_8
1072 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1073 * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
1076 LABEL(ashr_8_sse4_2):
1078 movdqa (%rdi), %xmm2
1079 movdqa (%rsi), %xmm1
1080 pcmpeqb %xmm1, %xmm0
1082 TOLOWER (%xmm1, %xmm2)
1083 pcmpeqb %xmm1, %xmm2
1085 pmovmskb %xmm2, %r9d
1089 jnz LABEL(less32bytes_sse4_2)
1090 movdqa (%rdi), %xmm3
1092 UPDATE_STRNCMP_COUNTER
1095 mov $16, %rcx /* index for loads */
1096 mov $8, %r9d /* byte position left over from less32bytes case */
1098 * Set up %r10 so that we can detect crossing a page boundary.
1099 * When %r10 goes positive we have crossed a page boundary and
1100 * need to do a nibble.
1103 and $0xfff, %r10 /* offset into 4K page */
1104 sub $0x1000, %r10 /* subtract 4K pagesize */
1105 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1108 LABEL(loop_ashr_8_use_sse4_2):
1110 jg LABEL(nibble_ashr_8_use_sse4_2)
1112 LABEL(nibble_ashr_8_use_sse4_2_restart):
1113 movdqa (%rdi, %rdx), %xmm0
1114 palignr $8, -16(%rdi, %rdx), %xmm0
1115 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1116 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1118 movdqa (%rsi,%rdx), %xmm1
1119 TOLOWER (%xmm0, %xmm1)
1120 pcmpistri $0x1a, %xmm1, %xmm0
1122 jbe LABEL(use_sse4_2_exit)
1123 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1125 jbe LABEL(strcmp_exitz_sse4_2)
1130 jg LABEL(nibble_ashr_8_use_sse4_2)
1132 movdqa (%rdi, %rdx), %xmm0
1133 palignr $8, -16(%rdi, %rdx), %xmm0
1134 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1135 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1137 movdqa (%rsi,%rdx), %xmm1
1138 TOLOWER (%xmm0, %xmm1)
1139 pcmpistri $0x1a, %xmm1, %xmm0
1141 jbe LABEL(use_sse4_2_exit)
1142 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1144 jbe LABEL(strcmp_exitz_sse4_2)
1147 jmp LABEL(loop_ashr_8_use_sse4_2)
1150 LABEL(nibble_ashr_8_use_sse4_2):
1152 movdqa -16(%rdi, %rdx), %xmm0
1154 pcmpistri $0x3a,%xmm0, %xmm0
1155 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1157 jae LABEL(nibble_ashr_use_sse4_2_exit)
1160 ja LABEL(nibble_ashr_8_use_sse4_2_restart)
1162 jmp LABEL(nibble_ashr_use_sse4_2_exit)
1165 * The following cases will be handled by ashr_9
1166 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1167 * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
1170 LABEL(ashr_9_sse4_2):
1172 movdqa (%rdi), %xmm2
1173 movdqa (%rsi), %xmm1
1174 pcmpeqb %xmm1, %xmm0
1176 TOLOWER (%xmm1, %xmm2)
1177 pcmpeqb %xmm1, %xmm2
1179 pmovmskb %xmm2, %r9d
1183 jnz LABEL(less32bytes_sse4_2)
1184 movdqa (%rdi), %xmm3
1186 UPDATE_STRNCMP_COUNTER
1189 mov $16, %rcx /* index for loads */
1190 mov $9, %r9d /* byte position left over from less32bytes case */
1192 * Set up %r10 so that we can detect crossing a page boundary.
1193 * When %r10 goes positive we have crossed a page boundary and
1194 * need to do a nibble.
1197 and $0xfff, %r10 /* offset into 4K page */
1198 sub $0x1000, %r10 /* subtract 4K pagesize */
1199 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1202 LABEL(loop_ashr_9_use_sse4_2):
1204 jg LABEL(nibble_ashr_9_use_sse4_2)
1206 LABEL(nibble_ashr_9_use_sse4_2_restart):
1207 movdqa (%rdi, %rdx), %xmm0
1209 palignr $9, -16(%rdi, %rdx), %xmm0
1210 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1211 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1213 movdqa (%rsi,%rdx), %xmm1
1214 TOLOWER (%xmm0, %xmm1)
1215 pcmpistri $0x1a, %xmm1, %xmm0
1217 jbe LABEL(use_sse4_2_exit)
1218 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1220 jbe LABEL(strcmp_exitz_sse4_2)
1225 jg LABEL(nibble_ashr_9_use_sse4_2)
1227 movdqa (%rdi, %rdx), %xmm0
1228 palignr $9, -16(%rdi, %rdx), %xmm0
1229 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1230 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1232 movdqa (%rsi,%rdx), %xmm1
1233 TOLOWER (%xmm0, %xmm1)
1234 pcmpistri $0x1a, %xmm1, %xmm0
1236 jbe LABEL(use_sse4_2_exit)
1237 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1239 jbe LABEL(strcmp_exitz_sse4_2)
1242 jmp LABEL(loop_ashr_9_use_sse4_2)
1245 LABEL(nibble_ashr_9_use_sse4_2):
1247 movdqa -16(%rdi, %rdx), %xmm0
1249 pcmpistri $0x3a,%xmm0, %xmm0
1250 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1252 jae LABEL(nibble_ashr_use_sse4_2_exit)
1255 ja LABEL(nibble_ashr_9_use_sse4_2_restart)
1257 jmp LABEL(nibble_ashr_use_sse4_2_exit)
1260 * The following cases will be handled by ashr_10
1261 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1262 * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
1265 LABEL(ashr_10_sse4_2):
1267 movdqa (%rdi), %xmm2
1268 movdqa (%rsi), %xmm1
1269 pcmpeqb %xmm1, %xmm0
1271 TOLOWER (%xmm1, %xmm2)
1272 pcmpeqb %xmm1, %xmm2
1274 pmovmskb %xmm2, %r9d
1278 jnz LABEL(less32bytes_sse4_2)
1279 movdqa (%rdi), %xmm3
1281 UPDATE_STRNCMP_COUNTER
1284 mov $16, %rcx /* index for loads */
1285 mov $10, %r9d /* byte position left over from less32bytes case */
1287 * Set up %r10 so that we can detect crossing a page boundary.
1288 * When %r10 goes positive we have crossed a page boundary and
1289 * need to do a nibble.
1292 and $0xfff, %r10 /* offset into 4K page */
1293 sub $0x1000, %r10 /* subtract 4K pagesize */
1294 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1297 LABEL(loop_ashr_10_use_sse4_2):
1299 jg LABEL(nibble_ashr_10_use_sse4_2)
1301 LABEL(nibble_ashr_10_use_sse4_2_restart):
1302 movdqa (%rdi, %rdx), %xmm0
1303 palignr $10, -16(%rdi, %rdx), %xmm0
1304 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1305 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1307 movdqa (%rsi,%rdx), %xmm1
1308 TOLOWER (%xmm0, %xmm1)
1309 pcmpistri $0x1a, %xmm1, %xmm0
1311 jbe LABEL(use_sse4_2_exit)
1312 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1314 jbe LABEL(strcmp_exitz_sse4_2)
1319 jg LABEL(nibble_ashr_10_use_sse4_2)
1321 movdqa (%rdi, %rdx), %xmm0
1322 palignr $10, -16(%rdi, %rdx), %xmm0
1323 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1324 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1326 movdqa (%rsi,%rdx), %xmm1
1327 TOLOWER (%xmm0, %xmm1)
1328 pcmpistri $0x1a, %xmm1, %xmm0
1330 jbe LABEL(use_sse4_2_exit)
1331 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1333 jbe LABEL(strcmp_exitz_sse4_2)
1336 jmp LABEL(loop_ashr_10_use_sse4_2)
1339 LABEL(nibble_ashr_10_use_sse4_2):
1341 movdqa -16(%rdi, %rdx), %xmm0
1343 pcmpistri $0x3a,%xmm0, %xmm0
1344 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1346 jae LABEL(nibble_ashr_use_sse4_2_exit)
1349 ja LABEL(nibble_ashr_10_use_sse4_2_restart)
1351 jmp LABEL(nibble_ashr_use_sse4_2_exit)
1354 * The following cases will be handled by ashr_11
1355 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1356 * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
1359 LABEL(ashr_11_sse4_2):
/* ashr_11 case.  This label has an entry in unaligned_table_sse4_2; the
   code that dispatches through that table is outside this excerpt.
   NOTE(review): some source lines are missing from this listing (see the
   gaps in the leading line numbers), so a few branches below test flags
   produced by instructions that are not visible here.  */
/* Load the first 16-byte chunk of each string (movdqa requires 16-byte
   aligned addresses).  */
1361 movdqa (%rdi), %xmm2
1362 movdqa (%rsi), %xmm1
/* Byte-wise equality of %xmm1 against %xmm0, result left in %xmm0.
   Presumably %xmm0 was zeroed beforehand so that this marks NUL bytes;
   the zeroing is not visible here -- TODO confirm.  */
1363 pcmpeqb %xmm1, %xmm0
/* TOLOWER is a macro defined elsewhere in this file; presumably it
   case-folds both operands in the case-insensitive builds -- TODO confirm.  */
1365 TOLOWER (%xmm1, %xmm2)
/* %xmm2 = 0xff in every byte position where %xmm2 and %xmm1 are equal.  */
1366 pcmpeqb %xmm1, %xmm2
/* Gather the top bit of each byte of %xmm2 into a 16-bit mask in %r9d.  */
1368 pmovmskb %xmm2, %r9d
/* The arithmetic that sets ZF for this branch is not visible here.  */
1372 jnz LABEL(less32bytes_sse4_2)
/* Second copy of the first %rdi chunk, kept in %xmm3.  */
1373 movdqa (%rdi), %xmm3
/* Macro defined near the top of the file: in the length-limited builds it
   derives the remaining count into %r9 from %rcx and %r11 and branches to
   strcmp_exitz_sse4_2; in the strcasecmp_l build it expands to nothing.  */
1375 UPDATE_STRNCMP_COUNTER
1378 mov $16, %rcx /* index for loads */
1379 mov $11, %r9d /* byte position left over from less32bytes case */
1381 * Setup %r10 value allows us to detect crossing a page boundary.
1382 * When %r10 goes positive we have crossed a page boundary and
1383 * need to do a nibble.
1386 and $0xfff, %r10 /* offset into 4K page */
1387 sub $0x1000, %r10 /* subtract 4K pagesize */
1388 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* NOTE(review): the instruction that first loads %r10 is not visible in
   this listing (line-number gap just before the `and' above).  */
/* Main loop; the compare step appears twice per iteration.  %rdx
   (initialised to 16 above) is the byte offset used to index both
   strings.  */
1391 LABEL(loop_ashr_11_use_sse4_2):
/* Page-cross check: the instruction that sets the flags for this jg is
   among the lines missing from this listing.  */
1393 jg LABEL(nibble_ashr_11_use_sse4_2)
1395 LABEL(nibble_ashr_11_use_sse4_2_restart):
/* Load the current aligned %rdi chunk, then let palignr concatenate it
   (high half) with the previous chunk (low half) and shift the pair right
   by 11 bytes: %xmm0 = bytes 11..15 of the previous chunk followed by
   bytes 0..10 of the current one.  */
1396 movdqa (%rdi, %rdx), %xmm0
1397 palignr $11, -16(%rdi, %rdx), %xmm0
/* Compare %xmm0 with the 16 aligned bytes at (%rsi,%rdx).  imm8 0x1a
   selects signed bytes, "equal each" aggregation and negative polarity:
   CF is set when some byte pair differs, ZF when the %rsi-side operand
   contains a NUL, and %ecx receives the index of the first difference.
   The case-insensitive builds load the %rsi chunk into %xmm1 and run
   TOLOWER on both operands first.  */
1398 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1399 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1401 movdqa (%rsi,%rdx), %xmm1
1402 TOLOWER (%xmm0, %xmm1)
1403 pcmpistri $0x1a, %xmm1, %xmm0
/* jbe = CF or ZF: leave the loop on a difference or a NUL.  */
1405 jbe LABEL(use_sse4_2_exit)
/* Length-limited builds only.  The instruction that sets the flags for
   this jbe is not visible here; presumably it decrements the remaining
   count -- TODO confirm.  */
1406 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1408 jbe LABEL(strcmp_exitz_sse4_2)
/* Second copy of the step above.  Any update of %rdx/%r10 between the
   two steps, and the flag setter for this jg, are not visible in this
   listing.  */
1413 jg LABEL(nibble_ashr_11_use_sse4_2)
1415 movdqa (%rdi, %rdx), %xmm0
1416 palignr $11, -16(%rdi, %rdx), %xmm0
1417 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1418 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1420 movdqa (%rsi,%rdx), %xmm1
1421 TOLOWER (%xmm0, %xmm1)
1422 pcmpistri $0x1a, %xmm1, %xmm0
1424 jbe LABEL(use_sse4_2_exit)
1425 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1427 jbe LABEL(strcmp_exitz_sse4_2)
1430 jmp LABEL(loop_ashr_11_use_sse4_2)
1433 LABEL(nibble_ashr_11_use_sse4_2):
/* Page-boundary path, entered from the jg checks above.  Reloads the
   previous aligned %rdi chunk; one line between this load and the
   pcmpistri is missing from this listing.  */
1435 movdqa -16(%rdi, %rdx), %xmm0
/* Both operands are %xmm0.  With imm8 0x3a (signed bytes, "equal each",
   masked negative polarity) the index returned in %ecx is the position
   of the first NUL byte in %xmm0, or 16 when there is none.  */
1437 pcmpistri $0x3a,%xmm0, %xmm0
/* The compares that feed the jae and ja below are not visible here.  */
1438 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1440 jae LABEL(nibble_ashr_use_sse4_2_exit)
/* Either resume the main loop at the restart label or take the shared
   exit path.  */
1443 ja LABEL(nibble_ashr_11_use_sse4_2_restart)
1445 jmp LABEL(nibble_ashr_use_sse4_2_exit)
1448 * The following cases will be handled by ashr_12
1449 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1450 * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
1453 LABEL(ashr_12_sse4_2):
/* ashr_12 case.  This label has an entry in unaligned_table_sse4_2; the
   code that dispatches through that table is outside this excerpt.
   NOTE(review): some source lines are missing from this listing (see the
   gaps in the leading line numbers), so a few branches below test flags
   produced by instructions that are not visible here.  */
/* Load the first 16-byte chunk of each string (movdqa requires 16-byte
   aligned addresses).  */
1455 movdqa (%rdi), %xmm2
1456 movdqa (%rsi), %xmm1
/* Byte-wise equality of %xmm1 against %xmm0, result left in %xmm0.
   Presumably %xmm0 was zeroed beforehand so that this marks NUL bytes;
   the zeroing is not visible here -- TODO confirm.  */
1457 pcmpeqb %xmm1, %xmm0
/* TOLOWER is a macro defined elsewhere in this file; presumably it
   case-folds both operands in the case-insensitive builds -- TODO confirm.  */
1459 TOLOWER (%xmm1, %xmm2)
/* %xmm2 = 0xff in every byte position where %xmm2 and %xmm1 are equal.  */
1460 pcmpeqb %xmm1, %xmm2
/* Gather the top bit of each byte of %xmm2 into a 16-bit mask in %r9d.  */
1462 pmovmskb %xmm2, %r9d
/* The arithmetic that sets ZF for this branch is not visible here.  */
1466 jnz LABEL(less32bytes_sse4_2)
/* Second copy of the first %rdi chunk, kept in %xmm3.  */
1467 movdqa (%rdi), %xmm3
/* Macro defined near the top of the file: in the length-limited builds it
   derives the remaining count into %r9 from %rcx and %r11 and branches to
   strcmp_exitz_sse4_2; in the strcasecmp_l build it expands to nothing.  */
1469 UPDATE_STRNCMP_COUNTER
1472 mov $16, %rcx /* index for loads */
1473 mov $12, %r9d /* byte position left over from less32bytes case */
1475 * Setup %r10 value allows us to detect crossing a page boundary.
1476 * When %r10 goes positive we have crossed a page boundary and
1477 * need to do a nibble.
1480 and $0xfff, %r10 /* offset into 4K page */
1481 sub $0x1000, %r10 /* subtract 4K pagesize */
1482 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* NOTE(review): the instruction that first loads %r10 is not visible in
   this listing (line-number gap just before the `and' above).  */
/* Main loop; the compare step appears twice per iteration.  %rdx
   (initialised to 16 above) is the byte offset used to index both
   strings.  */
1485 LABEL(loop_ashr_12_use_sse4_2):
/* Page-cross check: the instruction that sets the flags for this jg is
   among the lines missing from this listing.  */
1487 jg LABEL(nibble_ashr_12_use_sse4_2)
1489 LABEL(nibble_ashr_12_use_sse4_2_restart):
/* Load the current aligned %rdi chunk, then let palignr concatenate it
   (high half) with the previous chunk (low half) and shift the pair right
   by 12 bytes: %xmm0 = bytes 12..15 of the previous chunk followed by
   bytes 0..11 of the current one.  */
1490 movdqa (%rdi, %rdx), %xmm0
1491 palignr $12, -16(%rdi, %rdx), %xmm0
/* Compare %xmm0 with the 16 aligned bytes at (%rsi,%rdx).  imm8 0x1a
   selects signed bytes, "equal each" aggregation and negative polarity:
   CF is set when some byte pair differs, ZF when the %rsi-side operand
   contains a NUL, and %ecx receives the index of the first difference.
   The case-insensitive builds load the %rsi chunk into %xmm1 and run
   TOLOWER on both operands first.  */
1492 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1493 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1495 movdqa (%rsi,%rdx), %xmm1
1496 TOLOWER (%xmm0, %xmm1)
1497 pcmpistri $0x1a, %xmm1, %xmm0
/* jbe = CF or ZF: leave the loop on a difference or a NUL.  */
1499 jbe LABEL(use_sse4_2_exit)
/* Length-limited builds only.  The instruction that sets the flags for
   this jbe is not visible here; presumably it decrements the remaining
   count -- TODO confirm.  */
1500 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1502 jbe LABEL(strcmp_exitz_sse4_2)
/* Second copy of the step above.  Any update of %rdx/%r10 between the
   two steps, and the flag setter for this jg, are not visible in this
   listing.  */
1507 jg LABEL(nibble_ashr_12_use_sse4_2)
1509 movdqa (%rdi, %rdx), %xmm0
1510 palignr $12, -16(%rdi, %rdx), %xmm0
1511 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1512 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1514 movdqa (%rsi,%rdx), %xmm1
1515 TOLOWER (%xmm0, %xmm1)
1516 pcmpistri $0x1a, %xmm1, %xmm0
1518 jbe LABEL(use_sse4_2_exit)
1519 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1521 jbe LABEL(strcmp_exitz_sse4_2)
1524 jmp LABEL(loop_ashr_12_use_sse4_2)
1527 LABEL(nibble_ashr_12_use_sse4_2):
/* Page-boundary path, entered from the jg checks above.  Reloads the
   previous aligned %rdi chunk; one line between this load and the
   pcmpistri is missing from this listing.  */
1529 movdqa -16(%rdi, %rdx), %xmm0
/* Both operands are %xmm0.  With imm8 0x3a (signed bytes, "equal each",
   masked negative polarity) the index returned in %ecx is the position
   of the first NUL byte in %xmm0, or 16 when there is none.  */
1531 pcmpistri $0x3a,%xmm0, %xmm0
/* The compares that feed the jae and ja below are not visible here.  */
1532 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1534 jae LABEL(nibble_ashr_use_sse4_2_exit)
/* Either resume the main loop at the restart label or take the shared
   exit path.  */
1537 ja LABEL(nibble_ashr_12_use_sse4_2_restart)
1539 jmp LABEL(nibble_ashr_use_sse4_2_exit)
1542 * The following cases will be handled by ashr_13
1543 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1544 * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
1547 LABEL(ashr_13_sse4_2):
/* ashr_13 case.  This label has an entry in unaligned_table_sse4_2; the
   code that dispatches through that table is outside this excerpt.
   NOTE(review): some source lines are missing from this listing (see the
   gaps in the leading line numbers), so a few branches below test flags
   produced by instructions that are not visible here.  */
/* Load the first 16-byte chunk of each string (movdqa requires 16-byte
   aligned addresses).  */
1549 movdqa (%rdi), %xmm2
1550 movdqa (%rsi), %xmm1
/* Byte-wise equality of %xmm1 against %xmm0, result left in %xmm0.
   Presumably %xmm0 was zeroed beforehand so that this marks NUL bytes;
   the zeroing is not visible here -- TODO confirm.  */
1551 pcmpeqb %xmm1, %xmm0
/* TOLOWER is a macro defined elsewhere in this file; presumably it
   case-folds both operands in the case-insensitive builds -- TODO confirm.  */
1553 TOLOWER (%xmm1, %xmm2)
/* %xmm2 = 0xff in every byte position where %xmm2 and %xmm1 are equal.  */
1554 pcmpeqb %xmm1, %xmm2
/* Gather the top bit of each byte of %xmm2 into a 16-bit mask in %r9d.  */
1556 pmovmskb %xmm2, %r9d
/* The arithmetic that sets ZF for this branch is not visible here.  */
1560 jnz LABEL(less32bytes_sse4_2)
/* Second copy of the first %rdi chunk, kept in %xmm3.  */
1561 movdqa (%rdi), %xmm3
/* Macro defined near the top of the file: in the length-limited builds it
   derives the remaining count into %r9 from %rcx and %r11 and branches to
   strcmp_exitz_sse4_2; in the strcasecmp_l build it expands to nothing.  */
1563 UPDATE_STRNCMP_COUNTER
1566 mov $16, %rcx /* index for loads */
1567 mov $13, %r9d /* byte position left over from less32bytes case */
1569 * Setup %r10 value allows us to detect crossing a page boundary.
1570 * When %r10 goes positive we have crossed a page boundary and
1571 * need to do a nibble.
1574 and $0xfff, %r10 /* offset into 4K page */
1575 sub $0x1000, %r10 /* subtract 4K pagesize */
1577 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* NOTE(review): the instruction that first loads %r10 is not visible in
   this listing (line-number gap just before the `and' above).  */
/* Main loop; the compare step appears twice per iteration.  %rdx
   (initialised to 16 above) is the byte offset used to index both
   strings.  */
1580 LABEL(loop_ashr_13_use_sse4_2):
/* Page-cross check: the instruction that sets the flags for this jg is
   among the lines missing from this listing.  */
1582 jg LABEL(nibble_ashr_13_use_sse4_2)
1584 LABEL(nibble_ashr_13_use_sse4_2_restart):
/* Load the current aligned %rdi chunk, then let palignr concatenate it
   (high half) with the previous chunk (low half) and shift the pair right
   by 13 bytes: %xmm0 = bytes 13..15 of the previous chunk followed by
   bytes 0..12 of the current one.  */
1585 movdqa (%rdi, %rdx), %xmm0
1586 palignr $13, -16(%rdi, %rdx), %xmm0
/* Compare %xmm0 with the 16 aligned bytes at (%rsi,%rdx).  imm8 0x1a
   selects signed bytes, "equal each" aggregation and negative polarity:
   CF is set when some byte pair differs, ZF when the %rsi-side operand
   contains a NUL, and %ecx receives the index of the first difference.
   The case-insensitive builds load the %rsi chunk into %xmm1 and run
   TOLOWER on both operands first.  */
1587 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1588 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1590 movdqa (%rsi,%rdx), %xmm1
1591 TOLOWER (%xmm0, %xmm1)
1592 pcmpistri $0x1a, %xmm1, %xmm0
/* jbe = CF or ZF: leave the loop on a difference or a NUL.  */
1594 jbe LABEL(use_sse4_2_exit)
/* Length-limited builds only.  The instruction that sets the flags for
   this jbe is not visible here; presumably it decrements the remaining
   count -- TODO confirm.  */
1595 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1597 jbe LABEL(strcmp_exitz_sse4_2)
/* Second copy of the step above.  Any update of %rdx/%r10 between the
   two steps, and the flag setter for this jg, are not visible in this
   listing.  */
1602 jg LABEL(nibble_ashr_13_use_sse4_2)
1604 movdqa (%rdi, %rdx), %xmm0
1605 palignr $13, -16(%rdi, %rdx), %xmm0
1606 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1607 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1609 movdqa (%rsi,%rdx), %xmm1
1610 TOLOWER (%xmm0, %xmm1)
1611 pcmpistri $0x1a, %xmm1, %xmm0
1613 jbe LABEL(use_sse4_2_exit)
1614 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1616 jbe LABEL(strcmp_exitz_sse4_2)
1619 jmp LABEL(loop_ashr_13_use_sse4_2)
1622 LABEL(nibble_ashr_13_use_sse4_2):
/* Page-boundary path, entered from the jg checks above.  Reloads the
   previous aligned %rdi chunk; one line between this load and the
   pcmpistri is missing from this listing.  */
1624 movdqa -16(%rdi, %rdx), %xmm0
/* Both operands are %xmm0.  With imm8 0x3a (signed bytes, "equal each",
   masked negative polarity) the index returned in %ecx is the position
   of the first NUL byte in %xmm0, or 16 when there is none.  */
1626 pcmpistri $0x3a,%xmm0, %xmm0
/* The compares that feed the jae and ja below are not visible here.  */
1627 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1629 jae LABEL(nibble_ashr_use_sse4_2_exit)
/* Either resume the main loop at the restart label or take the shared
   exit path.  */
1632 ja LABEL(nibble_ashr_13_use_sse4_2_restart)
1634 jmp LABEL(nibble_ashr_use_sse4_2_exit)
1637 * The following cases will be handled by ashr_14
1638 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1639 * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
1642 LABEL(ashr_14_sse4_2):
/* ashr_14 case.  This label has an entry in unaligned_table_sse4_2; the
   code that dispatches through that table is outside this excerpt.
   NOTE(review): some source lines are missing from this listing (see the
   gaps in the leading line numbers), so a few branches below test flags
   produced by instructions that are not visible here.  */
/* Load the first 16-byte chunk of each string (movdqa requires 16-byte
   aligned addresses).  */
1644 movdqa (%rdi), %xmm2
1645 movdqa (%rsi), %xmm1
/* Byte-wise equality of %xmm1 against %xmm0, result left in %xmm0.
   Presumably %xmm0 was zeroed beforehand so that this marks NUL bytes;
   the zeroing is not visible here -- TODO confirm.  */
1646 pcmpeqb %xmm1, %xmm0
/* TOLOWER is a macro defined elsewhere in this file; presumably it
   case-folds both operands in the case-insensitive builds -- TODO confirm.  */
1648 TOLOWER (%xmm1, %xmm2)
/* %xmm2 = 0xff in every byte position where %xmm2 and %xmm1 are equal.  */
1649 pcmpeqb %xmm1, %xmm2
/* Gather the top bit of each byte of %xmm2 into a 16-bit mask in %r9d.  */
1651 pmovmskb %xmm2, %r9d
/* The arithmetic that sets ZF for this branch is not visible here.  */
1655 jnz LABEL(less32bytes_sse4_2)
/* Second copy of the first %rdi chunk, kept in %xmm3.  */
1656 movdqa (%rdi), %xmm3
/* Macro defined near the top of the file: in the length-limited builds it
   derives the remaining count into %r9 from %rcx and %r11 and branches to
   strcmp_exitz_sse4_2; in the strcasecmp_l build it expands to nothing.  */
1658 UPDATE_STRNCMP_COUNTER
1661 mov $16, %rcx /* index for loads */
1662 mov $14, %r9d /* byte position left over from less32bytes case */
1664 * Setup %r10 value allows us to detect crossing a page boundary.
1665 * When %r10 goes positive we have crossed a page boundary and
1666 * need to do a nibble.
1669 and $0xfff, %r10 /* offset into 4K page */
1670 sub $0x1000, %r10 /* subtract 4K pagesize */
1672 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* NOTE(review): the instruction that first loads %r10 is not visible in
   this listing (line-number gap just before the `and' above).  */
/* Main loop; the compare step appears twice per iteration.  %rdx
   (initialised to 16 above) is the byte offset used to index both
   strings.  */
1675 LABEL(loop_ashr_14_use_sse4_2):
/* Page-cross check: the instruction that sets the flags for this jg is
   among the lines missing from this listing.  */
1677 jg LABEL(nibble_ashr_14_use_sse4_2)
1679 LABEL(nibble_ashr_14_use_sse4_2_restart):
/* Load the current aligned %rdi chunk, then let palignr concatenate it
   (high half) with the previous chunk (low half) and shift the pair right
   by 14 bytes: %xmm0 = bytes 14..15 of the previous chunk followed by
   bytes 0..13 of the current one.  */
1680 movdqa (%rdi, %rdx), %xmm0
1681 palignr $14, -16(%rdi, %rdx), %xmm0
/* Compare %xmm0 with the 16 aligned bytes at (%rsi,%rdx).  imm8 0x1a
   selects signed bytes, "equal each" aggregation and negative polarity:
   CF is set when some byte pair differs, ZF when the %rsi-side operand
   contains a NUL, and %ecx receives the index of the first difference.
   The case-insensitive builds load the %rsi chunk into %xmm1 and run
   TOLOWER on both operands first.  */
1682 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1683 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1685 movdqa (%rsi,%rdx), %xmm1
1686 TOLOWER (%xmm0, %xmm1)
1687 pcmpistri $0x1a, %xmm1, %xmm0
/* jbe = CF or ZF: leave the loop on a difference or a NUL.  */
1689 jbe LABEL(use_sse4_2_exit)
/* Length-limited builds only.  The instruction that sets the flags for
   this jbe is not visible here; presumably it decrements the remaining
   count -- TODO confirm.  */
1690 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1692 jbe LABEL(strcmp_exitz_sse4_2)
/* Second copy of the step above.  Any update of %rdx/%r10 between the
   two steps, and the flag setter for this jg, are not visible in this
   listing.  */
1697 jg LABEL(nibble_ashr_14_use_sse4_2)
1699 movdqa (%rdi, %rdx), %xmm0
1700 palignr $14, -16(%rdi, %rdx), %xmm0
1701 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1702 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1704 movdqa (%rsi,%rdx), %xmm1
1705 TOLOWER (%xmm0, %xmm1)
1706 pcmpistri $0x1a, %xmm1, %xmm0
1708 jbe LABEL(use_sse4_2_exit)
1709 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1711 jbe LABEL(strcmp_exitz_sse4_2)
1714 jmp LABEL(loop_ashr_14_use_sse4_2)
1717 LABEL(nibble_ashr_14_use_sse4_2):
/* Page-boundary path, entered from the jg checks above.  Reloads the
   previous aligned %rdi chunk; one line between this load and the
   pcmpistri is missing from this listing.  */
1719 movdqa -16(%rdi, %rdx), %xmm0
/* Both operands are %xmm0.  With imm8 0x3a (signed bytes, "equal each",
   masked negative polarity) the index returned in %ecx is the position
   of the first NUL byte in %xmm0, or 16 when there is none.  */
1721 pcmpistri $0x3a,%xmm0, %xmm0
/* The compares that feed the jae and ja below are not visible here.  */
1722 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1724 jae LABEL(nibble_ashr_use_sse4_2_exit)
/* Either resume the main loop at the restart label or take the shared
   exit path.  */
1727 ja LABEL(nibble_ashr_14_use_sse4_2_restart)
1729 jmp LABEL(nibble_ashr_use_sse4_2_exit)
1732 * The following cases will be handled by ashr_15
1733 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1734 * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
1737 LABEL(ashr_15_sse4_2):
/* ashr_15 case.  This label has an entry in unaligned_table_sse4_2; the
   code that dispatches through that table is outside this excerpt.
   NOTE(review): some source lines are missing from this listing (see the
   gaps in the leading line numbers), so a few branches below test flags
   produced by instructions that are not visible here.  */
/* Load the first 16-byte chunk of each string (movdqa requires 16-byte
   aligned addresses).  */
1739 movdqa (%rdi), %xmm2
1740 movdqa (%rsi), %xmm1
/* Byte-wise equality of %xmm1 against %xmm0, result left in %xmm0.
   Presumably %xmm0 was zeroed beforehand so that this marks NUL bytes;
   the zeroing is not visible here -- TODO confirm.  */
1741 pcmpeqb %xmm1, %xmm0
/* TOLOWER is a macro defined elsewhere in this file; presumably it
   case-folds both operands in the case-insensitive builds -- TODO confirm.  */
1743 TOLOWER (%xmm1, %xmm2)
/* %xmm2 = 0xff in every byte position where %xmm2 and %xmm1 are equal.  */
1744 pcmpeqb %xmm1, %xmm2
/* Gather the top bit of each byte of %xmm2 into a 16-bit mask in %r9d.  */
1746 pmovmskb %xmm2, %r9d
/* The arithmetic that sets ZF for this branch is not visible here.  */
1750 jnz LABEL(less32bytes_sse4_2)
/* Second copy of the first %rdi chunk, kept in %xmm3.  */
1752 movdqa (%rdi), %xmm3
/* Macro defined near the top of the file: in the length-limited builds it
   derives the remaining count into %r9 from %rcx and %r11 and branches to
   strcmp_exitz_sse4_2; in the strcasecmp_l build it expands to nothing.  */
1754 UPDATE_STRNCMP_COUNTER
1757 mov $16, %rcx /* index for loads */
1758 mov $15, %r9d /* byte position left over from less32bytes case */
1760 * Setup %r10 value allows us to detect crossing a page boundary.
1761 * When %r10 goes positive we have crossed a page boundary and
1762 * need to do a nibble.
1765 and $0xfff, %r10 /* offset into 4K page */
1767 sub $0x1000, %r10 /* subtract 4K pagesize */
1769 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* NOTE(review): the instruction that first loads %r10 is not visible in
   this listing (line-number gap just before the `and' above).  */
/* Main loop; the compare step appears twice per iteration.  %rdx
   (initialised to 16 above) is the byte offset used to index both
   strings.  */
1772 LABEL(loop_ashr_15_use_sse4_2):
/* Page-cross check: the instruction that sets the flags for this jg is
   among the lines missing from this listing.  */
1774 jg LABEL(nibble_ashr_15_use_sse4_2)
1776 LABEL(nibble_ashr_15_use_sse4_2_restart):
/* Load the current aligned %rdi chunk, then let palignr concatenate it
   (high half) with the previous chunk (low half) and shift the pair right
   by 15 bytes: %xmm0 = byte 15 of the previous chunk followed by
   bytes 0..14 of the current one.  */
1777 movdqa (%rdi, %rdx), %xmm0
1778 palignr $15, -16(%rdi, %rdx), %xmm0
/* Compare %xmm0 with the 16 aligned bytes at (%rsi,%rdx).  imm8 0x1a
   selects signed bytes, "equal each" aggregation and negative polarity:
   CF is set when some byte pair differs, ZF when the %rsi-side operand
   contains a NUL, and %ecx receives the index of the first difference.
   The case-insensitive builds load the %rsi chunk into %xmm1 and run
   TOLOWER on both operands first.  */
1779 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1780 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1782 movdqa (%rsi,%rdx), %xmm1
1783 TOLOWER (%xmm0, %xmm1)
1784 pcmpistri $0x1a, %xmm1, %xmm0
/* jbe = CF or ZF: leave the loop on a difference or a NUL.  */
1786 jbe LABEL(use_sse4_2_exit)
/* Length-limited builds only.  The instruction that sets the flags for
   this jbe is not visible here; presumably it decrements the remaining
   count -- TODO confirm.  */
1787 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1789 jbe LABEL(strcmp_exitz_sse4_2)
/* Second copy of the step above.  Any update of %rdx/%r10 between the
   two steps, and the flag setter for this jg, are not visible in this
   listing.  */
1794 jg LABEL(nibble_ashr_15_use_sse4_2)
1796 movdqa (%rdi, %rdx), %xmm0
1797 palignr $15, -16(%rdi, %rdx), %xmm0
1798 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1799 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1801 movdqa (%rsi,%rdx), %xmm1
1802 TOLOWER (%xmm0, %xmm1)
1803 pcmpistri $0x1a, %xmm1, %xmm0
1805 jbe LABEL(use_sse4_2_exit)
1806 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1808 jbe LABEL(strcmp_exitz_sse4_2)
1811 jmp LABEL(loop_ashr_15_use_sse4_2)
1814 LABEL(nibble_ashr_15_use_sse4_2):
/* Page-boundary path, entered from the jg checks above.  Reloads the
   previous aligned %rdi chunk; one line between this load and the
   pcmpistri is missing from this listing.  */
1816 movdqa -16(%rdi, %rdx), %xmm0
/* Both operands are %xmm0.  With imm8 0x3a (signed bytes, "equal each",
   masked negative polarity) the index returned in %ecx is the position
   of the first NUL byte in %xmm0, or 16 when there is none.  */
1818 pcmpistri $0x3a,%xmm0, %xmm0
/* The compares that feed the jae and ja below are not visible here.  */
1819 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1821 jae LABEL(nibble_ashr_use_sse4_2_exit)
/* Resume the main loop at the restart label when the ja is taken.  No
   explicit jump to the shared exit is visible after it, unlike the other
   ashr_N cases; presumably execution falls through into
   nibble_ashr_use_sse4_2_exit, which follows -- TODO confirm.  */
1824 ja LABEL(nibble_ashr_15_use_sse4_2_restart)
1826 LABEL(nibble_ashr_use_sse4_2_exit):
/* Shared tail of the nibble_ashr_N paths: run the 16-byte pcmpistri of
   %xmm0 against the bytes at (%rsi,%rdx) (through TOLOWER in the
   case-insensitive builds) so that CF and %ecx describe the comparison,
   then continue into use_sse4_2_exit.
   NOTE(review): some source lines are missing from this listing (see the
   gaps in the leading line numbers).  */
1827 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1828 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1830 movdqa (%rsi,%rdx), %xmm1
1831 TOLOWER (%xmm0, %xmm1)
1832 pcmpistri $0x1a, %xmm1, %xmm0
1835 LABEL(use_sse4_2_exit):
/* Also the target of the `jbe' that follows each pcmpistri in the
   ashr_N loops.  CF clear after pcmpistri $0x1a means no differing byte
   was found.  */
1836 jnc LABEL(strcmp_exitz_sse4_2)
/* Length-limited builds only; the instruction that sets the flags for
   this jbe is not visible in this listing.  */
1837 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1839 jbe LABEL(strcmp_exitz_sse4_2)
/* %rdi += %r9 - 16, where %r9 holds the byte position set up by the
   ashr_N code.  Presumably this undoes the palignr realignment so that
   (%rdi,%rdx) and (%rsi,%rdx) address corresponding bytes -- TODO
   confirm; an instruction just before this one is missing here.  */
1842 lea -16(%rdi, %r9), %rdi
/* Fetch one byte from each string, zero-extended to 32 bits.  */
1843 movzbl (%rdi, %rdx), %eax
1844 movzbl (%rsi, %rdx), %edx
/* The test that sets ZF for this jz is not visible in this listing.  */
1846 jz LABEL(use_sse4_2_ret_sse4_2)
1848 LABEL(use_sse4_2_ret_sse4_2):
/* Case-insensitive builds: replace each byte value by its 32-bit entry
   in the table at _nl_C_LC_CTYPE_tolower + 128*4 (entries are 4 bytes
   wide, hence the scale factor of 4).  */
1849 # if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
1850 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
1851 movl (%rcx,%rdx,4), %edx
1852 movl (%rcx,%rax,4), %eax
1858 LABEL(less32bytes_sse4_2):
/* Target of the `jnz' in each ashr_N prologue visible in this file.
   Rebases both string pointers by the offsets held in %rax and %rcx.
   NOTE(review): some source lines are missing from this listing (see the
   gaps in the leading line numbers); in particular the test feeding the
   `jz' below is not visible.  */
1859 lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
1860 lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
1862 jz LABEL(ret_sse4_2)
1863 xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
1867 LABEL(less16bytes_sse4_2):
/* %rdx enters holding a bit mask; bsf replaces it with the index of its
   lowest set bit, which is then used as a byte offset into both
   strings.  */
1868 bsf %rdx, %rdx /* find and store bit index in %rdx */
/* Length-limited builds only; the compare that sets the flags for this
   jbe is not visible in this listing.  */
1870 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1872 jbe LABEL(strcmp_exitz_sse4_2)
/* Fetch the byte at that offset from each string, zero-extended.  */
1874 movzbl (%rsi, %rdx), %ecx
1875 movzbl (%rdi, %rdx), %eax
/* Case-insensitive builds: map both bytes through the 4-byte-per-entry
   table at _nl_C_LC_CTYPE_tolower + 128*4.  %rdx is reused as the table
   base here, so the byte offset it held is no longer available.  */
1877 # if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
1878 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
1879 movl (%rdx,%rcx,4), %ecx
1880 movl (%rdx,%rax,4), %eax
1886 LABEL(strcmp_exitz_sse4_2):
/* Jump target used throughout the routines above, e.g. when pcmpistri
   reports no differing byte (`jnc' in use_sse4_2_exit) and from the
   UPDATE_STRNCMP_COUNTER macro.
   NOTE(review): the instructions belonging to this label are missing from
   this listing (see the gap in the leading line numbers).  */
1891 // XXX Same as code above
1892 LABEL(Byte0_sse4_2):
/* NOTE(review): the instructions that load %ecx and %eax before this
   point are not visible here; only the case-insensitive table lookup
   remains in this listing.  */
/* Case-insensitive builds: map both values through the 4-byte-per-entry
   table at _nl_C_LC_CTYPE_tolower + 128*4.  */
1896 # if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
1897 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
1898 movl (%rdx,%rcx,4), %ecx
1899 movl (%rdx,%rax,4), %eax
/* Record the size of the STRCMP_SSE42 symbol as the distance from its
   start to the current location.  */
1905 .size STRCMP_SSE42, .-STRCMP_SSE42
1912 /* Put all SSE 4.2 functions together. */
1913 .section .rodata.sse4.2,"a",@progbits
/* Table of sixteen 32-bit entries in the read-only section selected
   above.  Each entry is the distance from the table's own label to one
   ashr_N entry point, so the values are stored relative to the table
   rather than as absolute addresses.  Order: ashr_1 ... ashr_15, with
   ashr_0 last.  The code that indexes this table is outside this
   excerpt.  */
1915 LABEL(unaligned_table_sse4_2):
1916 .int LABEL(ashr_1_sse4_2) - LABEL(unaligned_table_sse4_2)
1917 .int LABEL(ashr_2_sse4_2) - LABEL(unaligned_table_sse4_2)
1918 .int LABEL(ashr_3_sse4_2) - LABEL(unaligned_table_sse4_2)
1919 .int LABEL(ashr_4_sse4_2) - LABEL(unaligned_table_sse4_2)
1920 .int LABEL(ashr_5_sse4_2) - LABEL(unaligned_table_sse4_2)
1921 .int LABEL(ashr_6_sse4_2) - LABEL(unaligned_table_sse4_2)
1922 .int LABEL(ashr_7_sse4_2) - LABEL(unaligned_table_sse4_2)
1923 .int LABEL(ashr_8_sse4_2) - LABEL(unaligned_table_sse4_2)
1924 .int LABEL(ashr_9_sse4_2) - LABEL(unaligned_table_sse4_2)
1925 .int LABEL(ashr_10_sse4_2) - LABEL(unaligned_table_sse4_2)
1926 .int LABEL(ashr_11_sse4_2) - LABEL(unaligned_table_sse4_2)
1927 .int LABEL(ashr_12_sse4_2) - LABEL(unaligned_table_sse4_2)
1928 .int LABEL(ashr_13_sse4_2) - LABEL(unaligned_table_sse4_2)
1929 .int LABEL(ashr_14_sse4_2) - LABEL(unaligned_table_sse4_2)
1930 .int LABEL(ashr_15_sse4_2) - LABEL(unaligned_table_sse4_2)
1931 .int LABEL(ashr_0_sse4_2) - LABEL(unaligned_table_sse4_2)
/* Redefine the function-framing macros before including the generic
   ../strcmp.S at the end of this block, so that the code it assembles is
   emitted under the *_sse2 symbol names.  ENTRY and END ignore their
   `name' argument and always use STRCMP_SSE2.
   NOTE(review): some lines of these macro bodies are missing from this
   listing (see the gaps in the leading line numbers).  */
1935 # define ENTRY(name) \
1936 .type STRCMP_SSE2, @function; \
1938 STRCMP_SSE2: cfi_startproc; \
1941 # define END(name) \
1942 cfi_endproc; .size STRCMP_SSE2, .-STRCMP_SSE2
/* strcasecmp_l build: ENTRY2/END2 frame a second symbol,
   __strcasecmp_sse2, again ignoring the macro argument.  */
1944 # ifdef USE_AS_STRCASECMP_L
1945 # define ENTRY2(name) \
1946 .type __strcasecmp_sse2, @function; \
1948 __strcasecmp_sse2: cfi_startproc; \
1950 # define END2(name) \
1951 cfi_endproc; .size __strcasecmp_sse2, .-__strcasecmp_sse2
/* strncasecmp_l build: same as above for __strncasecmp_sse2.  */
1954 # ifdef USE_AS_STRNCASECMP_L
1955 # define ENTRY2(name) \
1956 .type __strncasecmp_sse2, @function; \
1958 __strncasecmp_sse2: cfi_startproc; \
1960 # define END2(name) \
1961 cfi_endproc; .size __strncasecmp_sse2, .-__strncasecmp_sse2
/* Replace libc_hidden_builtin_def so that the internal alias __GI_STRCMP
   is a global symbol equal to STRCMP_SSE2; the existing comment below
   gives the rationale.  */
1964 # undef libc_hidden_builtin_def
1965 /* It doesn't make sense to send libc-internal strcmp calls through a PLT.
1966 The speedup we get from using SSE4.2 instruction is likely eaten away
1967 by the indirect call in the PLT. */
1968 # define libc_hidden_builtin_def(name) \
1969 .globl __GI_STRCMP; __GI_STRCMP = STRCMP_SSE2
/* Assemble the generic implementation with the macros defined above.  */
1972 #include "../strcmp.S"