1 /* Optimized memset implementation for PowerPC64/POWER7.
2 Copyright (C) 2010 Free Software Foundation, Inc.
3 Contributed by Luis Machado <luisgpm@br.ibm.com>.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
24 /* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]); */
/* memset entry point.  Per the prototype comment above: s arrives in r3,
   c in r4, n in r5, and the result is returned in r3.  The paths visible
   below are: small (length <= 8), medium (length < 32), an aligned loop
   that handles 32 bytes per iteration for longer lengths, and a dcbz-based
   path for a zero fill value with a long length.
   NOTE(review): EALIGN's second argument (5) presumably requests
   2^5 = 32-byte alignment of the entry -- confirm against sysdep.h.  */
28 EALIGN (BP_SYM (memset), 5, 0)
36 /* Replicate byte to word. */
39 ble cr6,L(small) /* If length <= 8, use the short store code. */
42 ble cr7,L(medium) /* If length < 32, use the medium store code. */
44 andi. 11,10,7 /* r11 = r10 & 7 (also sets cr0): check alignment of DST. */
45 insrdi 4,4,32,0 /* Replicate word to doubleword: the low 32 bits of r4 are inserted into its high 32 bits. */
54 /* Get DST aligned to 8 bytes. */
63 4: bf 29,L(big_aligned) /* Branch when CR bit 29 is clear.  NOTE(review): the instruction that sets this bit is not shown here; presumably a DST alignment bit -- confirm. */
75 srdi 9,5,3 /* r9 = r5 >> 3: number of full doublewords remaining. */
80 /* From this point on, we'll store 32+ bytes and the fill value
81 isn't 0 (so we can't use dcbz). */
89 /* Store 1~3 doublewords so the main loop starts
90 at a multiple of 32 bytes. */
102 blt cr1,L(tail_bytes) /* cr1 reports "less than": go straight to the tail code. */
106 1: /* Store 1 doubleword. */
112 /* Main aligned store loop. Stores 32 bytes at a time and
113 ping-pongs between r10 and r12 to avoid address-generation (AGEN) delays. */
136 /* Check for tail bytes. */
142 /* At this point we have a tail of 0-7 bytes and we know that the
143 destination is doubleword-aligned. */
144 4: /* Store 4 bytes. */
149 2: /* Store 2 bytes. */
154 1: /* Store 1 byte. */
160 /* Special case when the fill value is 0 and we have a long length to
161 deal with. Use dcbz to zero out 128 bytes at a time. Before using
162 dcbz though, we need to get the destination 128-byte aligned. */
174 /* Get DST aligned to 128 bytes. */
201 1: bf 31,L(huge_aligned) /* Branch when CR bit 31 is clear.  NOTE(review): presumably a DST alignment bit -- confirm against the instruction that sets it. */
219 /* Check how many bytes are still left. */
228 /* We have a tail of 1~127 bytes. Store up to 15 doublewords for
229 speed. We'll handle the resulting tail bytes later. */
263 /* Handle the rest of the tail bytes here. */
283 /* Expanded tree to store tail bytes without increments. */
318 /* Handle stores of 9~31 bytes. */
321 /* At least 9 bytes to go. */
324 beq L(medium_aligned) /* Branch on "equal" in cr0 (the default field when none is named).  NOTE(review): presumably DST is already word-aligned -- confirm. */
326 /* Force 4-byte alignment for DST. */
329 1: /* Store 1 byte. */
334 2: /* Store 2 bytes. */
335 bf 30,L(medium_aligned) /* Branch when CR bit 30 is clear, skipping the 2-byte step. */
342 /* At least 6 bytes to go, and DST is word-aligned. */
353 8: /* Store 8 bytes. */
359 4: /* Store 4 bytes. */
364 2: /* Store 2-3 bytes. */
369 1: /* Store 1 byte. */
375 /* Handle stores of 0~8 bytes. */
385 END_GEN_TB (BP_SYM (memset),TB_TOCLESS)
386 libc_hidden_builtin_def (memset)
388 /* __bzero is defined in this file, next to memset, rather than in
389 bzero.S, to prevent the linker from inserting a stub between bzero
   and memset.
   NOTE(review): the instructions between ENTRY and END_GEN_TB are not
   visible here; presumably they rearrange the arguments and branch
   into memset -- confirm. */
390 ENTRY (BP_SYM (__bzero))
395 END_GEN_TB (BP_SYM (__bzero),TB_TOCLESS)
/* Export bzero as a weak alias of __bzero. */
397 weak_alias (BP_SYM (__bzero), BP_SYM (bzero))