1 /* Optimized strcasestr implementation for PowerPC64/POWER8.
2 Copyright (C) 2016-2017 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
20 #include <locale-defines.h>
22 /* char * [r3] strcasestr (char *s [r3], char *pat [r4]) */
24 /* The performance gain is obtained by comparing 16 bytes. */
26 /* When the first char of r4 is hit ITERATIONS times in r3,
27 fall back to default. */
31 /* For builds without IFUNC support, local calls should be made to the
32 internal GLIBC symbol (created by libc_hidden_builtin_def). */
/* STRLEN is defined either as the internal __GI_strlen or as strlen. */
34 # define STRLEN __GI_strlen
36 # define STRLEN strlen
41 /* For builds without IFUNC support, local calls should be made to the
42 internal GLIBC symbol (created by libc_hidden_builtin_def). */
/* STRNLEN is defined either as the internal __GI_strnlen or as __strnlen. */
44 # define STRNLEN __GI_strnlen
46 # define STRNLEN __strnlen
/* STRCHR is defined either as __GI_strchr or as strchr.  Presumably the
   same internal-symbol rule as for STRLEN and STRNLEN applies - confirm. */
52 # define STRCHR __GI_strchr
54 # define STRCHR strchr
58 /* Convert 16 bytes of v4 and reg to lowercase and compare.
   Reads v1 and v2 as range bounds and overwrites v6 and v7.
   Each vcmpgtub pair flags the bytes that are greater than v1 and
   smaller than v2, first for v4 and then for reg.
   The closing vcmpequb. is the record form of the compare, so besides
   writing v6 it updates CR6 with the outcome of comparing reg with v4,
   which lets the code after the macro branch on cr6. */
59 #define TOLOWER(reg) \
60 vcmpgtub v6, v4, v1; \
61 vcmpgtub v7, v2, v4; \
65 vcmpgtub v6, reg, v1; \
66 vcmpgtub v7, v2, reg; \
70 vcmpequb. v6, reg, v4;
72 /* TODO: change these to the actual instructions when the minimum required
73 binutils allows it. */
/* VCLZD_V8_v7 stands for the instruction vclzd v8, v7.  It is spelled
   either as the mnemonic or as the raw instruction word 0x11003fc2
   (primary opcode 4, VRT 8, VRB 7, extended opcode 1986), so that an
   assembler without the mnemonic can still emit it. */
75 #define VCLZD_V8_v7 vclzd v8, v7;
77 #define VCLZD_V8_v7 .long 0x11003fc2
/* Size of the stack frame pushed by the stdu in __strcasestr, the minimum
   frame size plus 48 bytes. */
80 #define FRAMESIZE (FRAME_MIN_SIZE+48)
81 /* TODO: change this to .machine power8 when the minimum required binutils allows it. */
/* __strcasestr - case-insensitive substring search, POWER8 version.
   In:   r3 = string to search (s), r4 = pattern (pat).
   Out:  r3 = position of the match, or 0 when the pattern is not found.
   Registers with a fixed role in the code below:
     r27         iteration counter, compared against ITERATIONS
     r29         return point of a match
     r30         copy of the pattern pointer, used to restore r4
     r31         length of the pattern
     r8          copy of r3 taken before the compare loops
     v0          compared against loaded data to detect a null byte
     v1, v2, v3  64, 91 and 32 in each byte, the TOLOWER constants
   r27-r31 and LR are stored relative to the incoming r1 before the frame
   of FRAMESIZE bytes is pushed. */
84 EALIGN (__strcasestr, 4, 0)
86 mflr r0 /* Copy the link register (LR) into r0. */
87 std r31, -8(r1) /* Save r31 on behalf of the caller. */
88 std r30, -16(r1) /* Save r30 on behalf of the caller. */
89 std r29, -24(r1) /* Save r29 on behalf of the caller. */
90 std r28, -32(r1) /* Save r28 on behalf of the caller. */
91 std r27, -40(r1) /* Save r27 on behalf of the caller. */
92 std r0, 16(r1) /* Store the link register value held in r0. */
99 stdu r1, -FRAMESIZE(r1) /* Create the stack frame. */
100 cfi_adjust_cfa_offset(FRAMESIZE)
104 cmpdi cr7, r3, 0 /* Input validation, compare r3 with 0. */
111 /* Load first byte from r4 and check if it is null. */
/* Address the thread-local __libc_tsd_LOCALE through its GOT entry, then
   load the doubleword at offset LOCALE_CTYPE_TOUPPER into r9. */
116 ld r10, __libc_tsd_LOCALE@got@tprel(r2)
117 add r9, r10, __libc_tsd_LOCALE@tls
119 ld r9, LOCALE_CTYPE_TOUPPER(r9)
120 sldi r10, r6, 2 /* Convert to upper case, r10 = r6 * 4 (presumably a table index - confirm). */
/* Same addressing sequence, this time for LOCALE_CTYPE_TOLOWER into r11. */
123 ld r10, __libc_tsd_LOCALE@got@tprel(r2)
124 add r11, r10, __libc_tsd_LOCALE@tls
126 ld r11, LOCALE_CTYPE_TOLOWER(r11)
127 sldi r10, r6, 2 /* Convert to lower case, r10 = r6 * 4. */
130 /* Check if the first char is present. */
143 beq cr7, L(skipcheck)
146 /* Move r3 to the first occurrence. */
154 /* Reg r27 is used to count the number of iterations. */
156 /* If first char of search str is not present. */
160 /* Find the length of pattern. */
165 cmpdi cr7, r3, 0 /* If search str is null. */
174 cmpd cr7, r3, r31 /* If len(r3) < len(r4). */
179 /* Locales not matching ASCII for single bytes. */
180 ld r10, __libc_tsd_LOCALE@got@tprel(r2)
181 add r9, r10, __libc_tsd_LOCALE@tls
184 addi r7, r7, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES
187 beq cr7, L(bytebybyte)
189 /* If len(r4) < 16 handle byte by byte. */
190 /* For shorter strings we will not use vector registers. */
192 blt cr7, L(bytebybyte)
194 /* Comparison values used for TOLOWER. */
195 /* Load v1 = 64('A' - 1), v2 = 91('Z' + 1), v3 = 32 in each byte. */
208 1. Load 16 bytes from r3 and r4
209 2. Check if there is null, If yes, proceed byte by byte path.
210 3. Else,Convert both to lowercase and compare.
211 4. If they are same proceed to 1.
212 5. If they dont match, find if first char of r4 is present in the
213 loaded 16 byte of r3.
214 6. If yes, move position, load next 16 bytes of r3 and proceed to 2.
217 mr r8, r3 /* Save r3 for future use. */
218 mr r4, r30 /* Restore r4 from the copy kept in r30. */
/* lvx ignores the low four bits of the address, so it always fetches the
   aligned quadword containing r4.  Hence the unaligned handling below. */
220 lvx v5, 0, r4 /* Load 16 bytes from r4. */
223 /* If r4 is unaligned, load another 16 bytes. */
224 #ifdef __LITTLE_ENDIAN__
231 #ifdef __LITTLE_ENDIAN__
238 vcmpequb. v7, v0, v4 /* Check for null. */
247 #ifdef __LITTLE_ENDIAN__
253 /* If r3 is unaligned, load another 16 bytes. */
/* Merge v10 and v4 into v4 under the permute control in v7.  The two vperm
   lines differ only in source order, presumably one per byte order. */
255 #ifdef __LITTLE_ENDIAN__
256 vperm v4, v10, v4, v7
258 vperm v4, v4, v10, v7
261 vcmpequb. v6, v0, v5 /* Check for null. */
273 /* Convert both 16-byte vectors to lower case. */
275 /* If both are same, branch to match. */
277 /* Find if the first char is present in next 15 bytes. */
278 #ifdef __LITTLE_ENDIAN__
280 vsldoi v7, v0, v4, 15
288 /* Shift r3 by 16 bytes and proceed. */
291 #ifdef __LITTLE_ENDIAN__
297 /* Shift r3 by 8 bytes and proceed. */
303 /* There is a match of 16 bytes, check next bytes. */
311 /* Load next 16 bytes of r3 and r4 and compare. */
315 /* Handle unaligned case. */
323 #ifdef __LITTLE_ENDIAN__
329 /* If r4 is unaligned, load another 16 bytes. */
/* Merge v9 and v6 into v11 under the permute control in v7. */
331 #ifdef __LITTLE_ENDIAN__
332 vperm v11, v9, v6, v7
334 vperm v11, v6, v9, v7
/* Compare v11 with v0, the same null check as on the first 16 bytes. */
342 vcmpequb. v7, v0, v11
350 beq cr7, L(nextload1)
351 /* Handle unaligned case. */
359 #ifdef __LITTLE_ENDIAN__
365 /* If r3 is unaligned, load another 16 bytes. */
367 #ifdef __LITTLE_ENDIAN__
368 vperm v4, v10, v4, v7
370 vperm v4, v4, v10, v7
384 /* Convert both 16-byte vectors to lower case. */
386 /* If both are same, branch to secondmatch. */
/* The LT bit of CR6 is set by a recording vector compare when every
   element compares true, so blt on cr6 means that all 16 bytes matched. */
387 blt cr6, L(secondmatch)
388 /* Continue the search. */
/* Reload the LOCALE_CTYPE_TOLOWER doubleword into r11 for the byte loop. */
393 ld r10, __libc_tsd_LOCALE@got@tprel(r2)
394 add r11, r10, __libc_tsd_LOCALE@tls
396 ld r11, LOCALE_CTYPE_TOLOWER(r11)
398 lbz r5, 0(r3) /* Load byte from r3. */
399 lbz r6, 0(r4) /* Load next byte from r4. */
400 cmpdi cr7, r6, 0 /* Is the byte from r4 null? */
402 cmpdi cr7, r5, 0 /* Is the byte from r3 null? */
403 beq cr7, L(retnull) /* If yes, go to retnull. */
405 addi r4, r4, 1 /* Increment r4. */
406 sldi r10, r5, 2 /* Convert to lower case, r10 = r5 * 4. */
408 sldi r7, r6, 2 /* Convert to lower case, r7 = r6 * 4. */
410 cmpw cr7, r7, r10 /* Compare with byte from r4. */
425 /* When our iterations exceed ITERATIONS, fall back to default. */
427 cmpdi cr7, r27, ITERATIONS
429 mr r4, r30 /* Restore r4 from the copy kept in r30. */
432 /* Handling byte by byte. */
/* Same ITERATIONS limit check on the counter r27 as in the vector path. */
437 cmpdi cr7, r27, ITERATIONS
441 /* Check if the first char is present. */
454 beq cr7, L(skipcheck1)
457 /* Move r3 to first occurrence. */
465 ld r10, __libc_tsd_LOCALE@got@tprel(r2)
466 add r11, r10, __libc_tsd_LOCALE@tls
468 ld r11, LOCALE_CTYPE_TOLOWER(r11)
469 mr r4, r30 /* Restore r4 from the copy kept in r30. */
470 mr r8, r3 /* Save r3. */
475 lbz r5, 0(r3) /* Load byte from r3. */
476 addi r4, r4, 1 /* Increment r4. */
477 lbz r6, 0(r4) /* Load next byte from r4. */
478 cmpdi cr7, r6, 0 /* Is the byte from r4 null? */
480 cmpdi cr7, r5, 0 /* Is the byte from r3 null? */
481 beq cr7, L(retnull) /* If yes, go to retnull. */
482 sldi r10, r5, 2 /* Convert to lower case, r10 = r5 * 4. */
484 sldi r7, r6, 2 /* Convert to lower case, r7 = r6 * 4. */
486 cmpw cr7, r7, r10 /* Compare with byte from r4. */
490 /* Handling return values. */
493 subf r3, r31, r3 /* Subtract r31 (len of r4) from r3. */
498 mr r3, r29 /* Return point of match. */
503 li r3, 0 /* Substring was not found. */
514 addi r1, r1, FRAMESIZE /* Restore stack pointer. */
515 cfi_adjust_cfa_offset(-FRAMESIZE)
516 ld r0, 16(r1) /* Restore the saved link register. */
519 ld r29, -24(r1) /* Restore the r29 value saved for the caller. */
520 ld r30, -16(r1) /* Restore the r30 value saved for the caller. */
521 ld r31, -8(r1) /* Restore the r31 value saved for the caller. */
528 mtlr r0 /* Move the return address from r0 back into LR. */
/* Make strcasestr a weak alias of __strcasestr.  The two hidden-definition
   macros presumably create the internal GLIBC symbols for local calls, as
   the comment on STRLEN says of libc_hidden_builtin_def - confirm. */
532 weak_alias (__strcasestr, strcasestr)
533 libc_hidden_def (__strcasestr)
534 libc_hidden_builtin_def (strcasestr)