powerpc: Optimized st{r,p}ncpy for POWER8/PPC64
[platform/upstream/glibc.git] / sysdeps / powerpc / powerpc64 / power8 / strncpy.S
1 /* Optimized strncpy/stpncpy implementation for PowerPC64/POWER8.
2    Copyright (C) 2015 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4
5    The GNU C Library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 2.1 of the License, or (at your option) any later version.
9
10    The GNU C Library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14
15    You should have received a copy of the GNU Lesser General Public
16    License along with the GNU C Library; if not, see
17    <http://www.gnu.org/licenses/>.  */
18
19 #include <sysdep.h>
20
21 #ifdef USE_AS_STPNCPY        /* Selects which entry point this file builds.  */
22 # define FUNC_NAME __stpncpy
23 #else
24 # define FUNC_NAME strncpy
25 #endif
26
27 /* Implements the function
28
29    char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
30
31    or
32
33    char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
34
35    if USE_AS_STPNCPY is defined.
36
37    The implementation uses unaligned doubleword access to avoid specialized
38    code paths depending on data alignment.  Although recent powerpc64 uses
39    64K pages as default, the page cross handling assumes a minimum page size
40    of 4k.  */
41
42         .machine  power7
43 EALIGN (FUNC_NAME, 4, 0)
44
45         /* Check if [src]+16 will cross a 4K page by checking if the bit
46            indicating the page size changes.  Basically:
47
48            uint64_t srcin = (uint64_t)src;
49            uint64_t ob = srcin & 4096UL;
50            uint64_t nb = (srcin+16UL) & 4096UL;
51            if (ob ^ nb)
52              goto pagecross;  */
53
54         addi    r10,r4,16       /* r10 = src + 16.  */
55         rlwinm  r9,r4,0,19,19   /* r9 = src & 4096 (ob).  */
56
57         /* Since it is a leaf function, save some non-volatile registers on the
58            protected/red zone.  */
59         std     r26,-48(r1)
60         std     r27,-40(r1)
61
62         rlwinm  r8,r10,0,19,19  /* r8 = (src + 16) & 4096 (nb).  */
63
64         std     r28,-32(r1)
65         std     r29,-24(r1)
66
67         cmpld   cr7,r9,r8       /* Compare ob with nb; result goes to CR field 7.  */
68
69         std     r30,-16(r1)
70         std     r31,-8(r1)
71
72         beq     cr7,L(unaligned_lt_16)  /* Same 4K bit: no page crossing.  */
73         rldicl  r9,r4,0,61      /* r9 = src & 7.  */
74         subfic  r8,r9,8         /* r8 = 8 - (src & 7).  */
75         cmpld   cr7,r5,r8
76         bgt     cr7,L(pagecross)        /* n > r8: page-cross path; else fall through to the byte copy.  */
77
78         /* At this point there are 1 to 15 bytes to check and write.  Since it could
79            be either from first unaligned 16 bytes access or from bulk copy, the code
80            uses an unrolled byte read/write instead of trying to analyze the cmpb
81            results.  */
82 L(short_path):
83         mr      r9,r3           /* r9 = destination write pointer.  */
84 L(short_path_1):
85         cmpdi   cr7,r5,0
86         beq     cr7,L(short_path_loop_end_1)    /* No bytes left: return.  */
87 L(short_path_2):
88         lbz     r10,0(r4)       /* Copy the first byte of the pair.  */
89         cmpdi   cr7,r10,0
90         stb     r10,0(r9)
91         beq     cr7,L(zero_pad_start_1) /* Byte was NUL: zero r5 bytes from r9.  */
92         cmpdi   cr0,r5,1
93         addi    r8,r9,1         /* r8 = address of the second byte in dest.  */
94         addi    r6,r5,-1        /* r6 = length left after the first byte.  */
95         beq     cr0,L(short_path_loop_end_0)    /* Length was 1: done.  */
96         lbz     r10,1(r4)       /* Copy the second byte of the pair.  */
97         cmpdi   cr7,r10,0
98         stb     r10,1(r9)
99         beq     cr7,L(zero_pad_start_prepare_1) /* NUL: pad r6 bytes from r8.  */
100         addi    r10,r5,-3       /* r10 = r5 - 3; tested in the loop for an odd tail.  */
101         b       L(short_path_loop_1)
102
103         .align  4
104 L(short_path_loop):
105         lbz     r8,0(r4)        /* First byte of this pair.  */
106         addi    r7,r10,-2       /* r7 = next value for r10.  */
107         cmpdi   cr5,r8,0
108         stb     r8,0(r9)
109         beq     cr5,L(zero_pad_start_1) /* NUL: zero r5 bytes from r9.  */
110         beq     cr7,L(short_path_loop_end_0)    /* cr7 = (r10 == 0) from below: that was the last byte.  */
111         lbz     r8,1(r4)        /* Second byte of this pair.  */
112         cmpdi   cr7,r8,0
113         stb     r8,1(r9)
114         beq     cr7,L(zero_pad_start)   /* NUL: zero r10 bytes from r6 (= r9 + 1).  */
115         mr      r10,r7
116 L(short_path_loop_1):
117         addic.  r5,r5,-2        /* Two bytes consumed; cr0 = (r5 ? 0).  */
118         addi    r9,r9,2
119         cmpdi   cr7,r10,0       /* cr7 is consumed by the beq at the top of the loop.  */
120         addi    r4,r4,2
121         addi    r6,r9,1         /* r6 = address of the second byte of the next pair.  */
122         bne     cr0,L(short_path_loop)
123 #ifdef USE_AS_STPNCPY
124         mr      r3,r9           /* Length exhausted: return the end of the copy.  */
125         b       L(short_path_loop_end)
126 #endif
127
128 L(short_path_loop_end_0):       /* Reached after storing one byte at 0(r9).  */
129 #ifdef USE_AS_STPNCPY
130         addi    r3,r9,1         /* Return the address just past that byte.  */
131         b       L(short_path_loop_end)
132 #endif
133 L(short_path_loop_end_1):       /* Reached when no bytes were left to copy.  */
134 #ifdef USE_AS_STPNCPY
135         mr      r3,r9           /* Return the current write pointer.  */
136 #endif
137 L(short_path_loop_end):
138         /* Restore non-volatile registers.  */
139         ld      r26,-48(r1)
140         ld      r27,-40(r1)
141         ld      r28,-32(r1)
142         ld      r29,-24(r1)
143         ld      r30,-16(r1)
144         ld      r31,-8(r1)
145         blr
146
147         /* This code pads the remainder of dest with NUL bytes.  The algorithm
148            calculates the remaining size and issues a doubleword unrolled
149            loop followed by a byte by byte set.  */
150         .align  4
151 L(zero_pad_start):
152         mr      r5,r10          /* r5 = number of bytes to zero.  */
153         mr      r9,r6           /* r9 = first address to zero.  */
154 L(zero_pad_start_1):
155         srdi.   r8,r5,3         /* r8 = r5 / 8 doublewords; cr0 = (r8 ? 0).  */
156         mr      r10,r9          /* r10 = pointer used by the byte tail.  */
157 #ifdef USE_AS_STPNCPY
158         mr      r3,r9           /* Return value: start of the padding.  */
159 #endif
160         beq-    cr0,L(zero_pad_loop_b_start)    /* Fewer than 8 bytes.  */
161         cmpldi  cr7,r8,1
162         li      r7,0
163         std     r7,0(r9)        /* First zero doubleword.  */
164         beq     cr7,L(zero_pad_loop_b_prepare)  /* Exactly one doubleword.  */
165         addic.  r8,r8,-2
166         addi    r10,r9,16
167         std     r7,8(r9)        /* Second zero doubleword.  */
168         beq     cr0,L(zero_pad_loop_dw_2)       /* Exactly two doublewords.  */
169         std     r7,16(r9)
170         li      r9,0            /* r9 is the zero source for the loop below.  */
171         b       L(zero_pad_loop_dw_1)
172
173         .align  4
174 L(zero_pad_loop_dw):            /* r9 holds zero here (set before L(zero_pad_loop_dw_1) is entered).  */
175         addi    r10,r10,16
176         std     r9,-8(r10)
177         beq     cr0,L(zero_pad_loop_dw_2)       /* cr0 from the addic. below: count hit zero.  */
178         std     r9,0(r10)
179 L(zero_pad_loop_dw_1):
180         cmpldi  cr7,r8,1        /* Exactly one doubleword left?  */
181         std     r9,0(r10)
182         addic.  r8,r8,-2        /* Two doublewords per iteration.  */
183         bne     cr7,L(zero_pad_loop_dw)
184         addi    r10,r10,8
185 L(zero_pad_loop_dw_2):
186         rldicl  r5,r5,0,61      /* r5 = r5 & 7: bytes left after the doublewords.  */
187 L(zero_pad_loop_b_start):
188         cmpdi   cr7,r5,0
189         addi    r5,r5,-1
190         addi    r9,r10,-1
191         add     r10,r10,r5
192         subf    r10,r9,r10      /* r10 = original r5, the tail byte count.  */
193         li      r8,0            /* Zero source for the tail stores.  */
194         beq-    cr7,L(short_path_loop_end)      /* No tail bytes: return.  */
195
196         /* Write remaining 1-8 bytes.  */
197         .align  4
198         addi    r9,r9,1         /* Undo the -1 applied above: r9 = tail start.  */
199         mtocrf  0x1,r10         /* Low 4 bits of the count -> CR bits 28-31.  */
200         bf      29,4f           /* Bit for 4 clear: skip the word store.  */
201         stw     r8,0(r9)
202         addi    r9,r9,4
203
204         .align  4
205 4:      bf      30,2f           /* Bit for 2 clear: skip the halfword store.  */
206         sth     r8,0(r9)
207         addi    r9,r9,2
208
209         .align  4
210 2:      bf      31,1f           /* Bit for 1 clear: skip the byte store.  */
211         stb     r8,0(r9)
212
213         /* Restore non-volatile registers.  */
214 1:      ld      r26,-48(r1)
215         ld      r27,-40(r1)
216         ld      r28,-32(r1)
217         ld      r29,-24(r1)
218         ld      r30,-16(r1)
219         ld      r31,-8(r1)
220         blr
221
222         /* The common case where [src]+16 will not cross a 4K page boundary.
223            In this case the code does a fast check of the first 16 bytes by using
224            doubleword read/compares and updates the destination if neither the
225            total size nor a null byte is found. */
226         .align  4
227 L(unaligned_lt_16):
228         cmpldi  cr7,r5,7
229         ble     cr7,L(short_path)       /* n <= 7: byte copy.  */
230         ld      r7,0(r4)        /* First source doubleword.  */
231         li      r8,0
232         cmpb    r8,r7,r8        /* r8 != 0 if any byte of r7 is zero.  */
233         cmpdi   cr7,r8,0
234         bne     cr7,L(short_path_prepare_2)     /* NUL inside: byte copy from the start.  */
235         addi    r6,r5,-8        /* r6 = length left after 8 bytes.  */
236         std     r7,0(r3)
237         addi    r9,r3,8         /* r9 = dest + 8.  */
238         cmpldi  cr7,r6,7
239         addi    r7,r4,8         /* r7 = src + 8.  */
240         ble     cr7,L(short_path_prepare_1_1)   /* <= 7 left: byte copy the rest.  */
241         ld      r4,8(r4)        /* Second source doubleword (overwrites r4).  */
242         cmpb    r8,r4,r8        /* r8 is zero here since the bne above fell through.  */
243         cmpdi   cr7,r8,0
244         bne     cr7,L(short_path_prepare_2_1)   /* NUL inside: byte copy from src + 8.  */
245         std     r4,8(r3)
246         addi    r29,r3,16       /* r29 = dest + 16.  */
247         addi    r5,r5,-16
248         /* Neither the null byte was found nor the total length was reached,
249            align to 16 bytes and issue a bulk copy/compare.  */
250         b       L(align_to_16b)
251
252         /* In the case of 4k page boundary cross, the algorithm first aligns
253            the address to a doubleword, calculates a mask based on alignment
254            to ignore the bytes and continues using doublewords.  */
255         .align  4
256 L(pagecross):
257         rldicr  r11,r4,0,59     /* Align the address to 8 bytes boundary.  NOTE(review): mask end 59 clears the low four bits; confirm intent.  */
258         li      r6,-1           /* MASK = 0xffffffffffffffffUL.  */
259         sldi    r9,r9,3         /* Calculate padding.  */
260         ld      r7,0(r11)       /* Load doubleword from memory.  */
261 #ifdef __LITTLE_ENDIAN__
262         sld     r9,r6,r9        /* MASK = MASK << padding.  */
263 #else
264         srd     r9,r6,r9        /* MASK = MASK >> padding.  */
265 #endif
266         orc     r9,r7,r9        /* Mask bits that are not part of the
267                                    string.  */
268         li      r7,0
269         cmpb    r9,r9,r7        /* Check for null bytes in DWORD1.  */
270         cmpdi   cr7,r9,0
271         bne     cr7,L(short_path_prepare_2)
272         subf    r8,r8,r5        /* Adjust total length.  */
273         cmpldi  cr7,r8,8        /* Check if length was reached.  */
274         ble     cr7,L(short_path_prepare_2)
275
276         /* For next checks we have aligned address, so we check for more
277            three doublewords to make sure we can read 16 unaligned bytes
278            to start the bulk copy with 16 aligned addresses.  */
279         ld      r7,8(r11)
280         cmpb    r9,r7,r9        /* r9 is zero here since the bne above fell through.  */
281         cmpdi   cr7,r9,0
282         bne     cr7,L(short_path_prepare_2)
283         addi    r7,r8,-8
284         cmpldi  cr7,r7,8
285         ble     cr7,L(short_path_prepare_2)
286         ld      r7,16(r11)
287         cmpb    r9,r7,r9
288         cmpdi   cr7,r9,0
289         bne     cr7,L(short_path_prepare_2)
290         addi    r8,r8,-16
291         cmpldi  cr7,r8,8
292         ble     cr7,L(short_path_prepare_2)
293         ld      r8,24(r11)
294         cmpb    r9,r8,r9
295         cmpdi   cr7,r9,0
296         bne     cr7,L(short_path_prepare_2)
297
298         /* No null byte found in the 32 bytes read and length not reached,
299            read source again using unaligned loads and store them.  */
300         ld      r9,0(r4)
301         addi    r29,r3,16       /* r29 = dest + 16.  */
302         addi    r5,r5,-16
303         std     r9,0(r3)
304         ld      r9,8(r4)
305         std     r9,8(r3)
306
307         /* Align source to 16 bytes and adjust destination and size.  */
308 L(align_to_16b):
309         rldicl  r9,r10,0,60     /* r9 = r10 & 15; r10 is src + 16 from the entry code.  */
310         rldicr  r28,r10,0,59    /* r28 = r10 with the low four bits cleared.  */
311         add     r12,r5,r9       /* r12 = length counted from r28.  */
312         subf    r29,r9,r29      /* Move dest back by the same r9 bytes.  */
313
314         /* The bulk read/compare/copy loads two doublewords, compare and merge
315            in a single register for speed.  This is an attempt to speed up the
316            null-checking process for bigger strings.  */
317
318         cmpldi  cr7,r12,15
319         ble     cr7,L(short_path_prepare_1_2)   /* <= 15 bytes: byte copy.  */
320
321         /* Main loop for large sizes, unrolled 2 times to get better use of
322            pipeline.  */
323         ld      r8,0(r28)
324         ld      r10,8(r28)
325         li      r9,0
326         cmpb    r7,r8,r9        /* Zero-byte map of the first doubleword.  */
327         cmpb    r9,r10,r9       /* Zero-byte map of the second doubleword.  */
328         or.     r6,r9,r7
329         bne     cr0,L(short_path_prepare_2_3)   /* NUL in these 16 bytes: byte copy.  */
330         addi    r5,r12,-16
331         addi    r4,r28,16
332         std     r8,0(r29)
333         std     r10,8(r29)
334         cmpldi  cr7,r5,15
335         addi    r9,r29,16
336         ble     cr7,L(short_path_1)     /* <= 15 left: byte copy from r4 to r9.  */
337         mr      r11,r28         /* r11 = source cursor for the loop.  */
338         mr      r6,r29          /* r6 = destination cursor for the loop.  */
339         li      r30,0           /* Zero operand for cmpb in L(loop_16b).  */
340         subfic  r26,r4,48       /* r26 = 48 - r4.  */
341         subfic  r27,r9,48       /* r27 = 48 - r9.  */
342
343         b       L(loop_16b)
344
345         .align  4
346 L(loop_start):
347         ld      r31,0(r11)
348         ld      r10,8(r11)
349         cmpb    r0,r31,r7       /* r7 is zero here: the or. in L(loop_16b) gave 0, else its bne was taken.  */
350         cmpb    r8,r10,r7
351         or.     r7,r0,r8        /* cr0 != 0 if either doubleword has a zero byte.  */
352         addi    r5,r5,-32
353         cmpldi  cr7,r5,15
354         add     r4,r4,r26
355         add     r9,r9,r27
356         bne     cr0,L(short_path_prepare_2_2)   /* NUL found: byte copy from r11 to r6.  */
357         add     r4,r28,r4
358         std     r31,0(r6)
359         add     r9,r29,r9
360         std     r10,8(r6)
361         ble     cr7,L(short_path_1)     /* <= 15 left: byte copy from r4 to r9.  */
362
363 L(loop_16b):
364         ld      r10,16(r11)
365         ld      r0,24(r11)
366         cmpb    r8,r10,r30      /* r30 is the zero operand set before the loop.  */
367         cmpb    r7,r0,r30
368         or.     r7,r8,r7
369         addi    r12,r12,-32
370         cmpldi  cr7,r12,15      /* Consumed by the bgt at the end of this half.  */
371         addi    r11,r11,32
372         bne     cr0,L(short_path_2)     /* NUL found: byte copy using r4/r5/r9.  */
373         std     r10,16(r6)
374         addi    r6,r6,32
375         std     r0,-8(r6)
376         bgt     cr7,L(loop_start)
377
378         mr      r5,r12          /* <= 15 left: hand r12/r11/r6 to the byte copy.  */
379         mr      r4,r11
380         mr      r9,r6
381         b       L(short_path_1)
382
383         .align  4
384 L(short_path_prepare_1_1):      /* Stubs below load r5 (length), r4 (source) and r9 (dest) for the byte copy.  */
385         mr      r5,r6
386         mr      r4,r7
387         b       L(short_path_1)
388 L(short_path_prepare_1_2):
389         mr      r5,r12
390         mr      r4,r28
391         mr      r9,r29
392         b       L(short_path_1)
393 L(short_path_prepare_2):        /* Restart from the original dest; r4 and r5 are left as they are.  */
394         mr      r9,r3
395         b       L(short_path_2)
396 L(short_path_prepare_2_1):
397         mr      r5,r6
398         mr      r4,r7
399         b       L(short_path_2)
400 L(short_path_prepare_2_2):
401         mr      r5,r12
402         mr      r4,r11
403         mr      r9,r6
404         b       L(short_path_2)
405 L(short_path_prepare_2_3):
406         mr      r5,r12
407         mr      r4,r28
408         mr      r9,r29
409         b       L(short_path_2)
410 L(zero_pad_loop_b_prepare):     /* One doubleword was already zeroed at 0(r9).  */
411         addi    r10,r9,8        /* r10 = address after that doubleword.  */
412         rldicl  r5,r5,0,61      /* r5 = r5 & 7: bytes left for the tail.  */
413         b       L(zero_pad_loop_b_start)
414 L(zero_pad_start_prepare_1):    /* NUL was the second byte of the first pair.  */
415         mr      r5,r6           /* r6 = length - 1.  */
416         mr      r9,r8           /* r8 = dest + 1.  */
417         b       L(zero_pad_start_1)
418 END (FUNC_NAME)
419
420 #ifdef USE_AS_STPNCPY        /* Same switch that selects FUNC_NAME above.  */
421 libc_hidden_def (__stpncpy)      /* Macro defined outside this file; presumably declares the hidden alias -- confirm.  */
422 #else
423 libc_hidden_builtin_def (strncpy)        /* Macro defined outside this file; presumably the builtin variant -- confirm.  */
424 #endif