sysdeps/powerpc/powerpc64/power6/memset.S

   1 /* Optimized 64-bit memset implementation for POWER6.
   2    Copyright (C) 1997-2014 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21 /* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
  22    Returns 's'.
  23
  24    The memset is done in three sizes: byte (8 bits), word (32 bits),
  25    cache line (256 bits). There is a special case for setting cache lines
  26    to 0, to take advantage of the dcbz instruction.  */
  27
  28         .machine power6
  29 EALIGN (memset, 7, 0)
  30         CALL_MCOUNT 3
  31
  32 #define rTMP    r0
  33 #define rRTN    r3      /* Initial value of 1st argument.  */
  34 #define rMEMP0  r3      /* Original value of 1st arg.  */
  35 #define rCHR    r4      /* Char to set in each byte.  */
  36 #define rLEN    r5      /* Length of region to set.  */
  37 #define rMEMP   r6      /* Address at which we are storing.  */
  38 #define rALIGN  r7      /* Number of bytes we are setting now (when aligning). */
  39 #define rMEMP2  r8
  40 #define rMEMP3  r9      /* Alt mem pointer.  */
  41 L(_memset):
  42 /* Take care of case for size <= 4.  */
  43         cmpldi  cr1, rLEN, 8
  44         andi.   rALIGN, rMEMP0, 7
  45         mr      rMEMP, rMEMP0
  46         ble     cr1, L(small)
  47
  48 /* Align to doubleword boundary.  */
  49         cmpldi  cr5, rLEN, 31
  50         insrdi  rCHR, rCHR, 8, 48       /* Replicate byte to halfword.  */
  51         beq+    L(aligned2)
  52         mtcrf   0x01, rMEMP0
  53         subfic  rALIGN, rALIGN, 8
  54         cror    28,30,31                /* Detect odd word aligned.  */
  55         add     rMEMP, rMEMP, rALIGN
  56         sub     rLEN, rLEN, rALIGN
  57         insrdi  rCHR, rCHR, 16, 32      /* Replicate halfword to word.  */
  58         bt      29, L(g4)
  59 /* Process the even word of doubleword.  */
  60         bf+     31, L(g2)
  61         stb     rCHR, 0(rMEMP0)
  62         bt      30, L(g4x)
  63 L(g2):
  64         sth     rCHR, -6(rMEMP)
  65 L(g4x):
  66         stw     rCHR, -4(rMEMP)
  67         b       L(aligned)
  68 /* Process the odd word of doubleword.  */
  69 L(g4):
  70         bf      28, L(g4x) /* If false, word aligned on odd word.  */
  71         bf+     31, L(g0)
  72         stb     rCHR, 0(rMEMP0)
  73         bt      30, L(aligned)
  74 L(g0):
  75         sth     rCHR, -2(rMEMP)
  76
  77 /* Handle the case of size < 31.  */
  78 L(aligned2):
  79         insrdi  rCHR, rCHR, 16, 32      /* Replicate halfword to word.  */
  80 L(aligned):
  81         mtcrf   0x01, rLEN
  82         ble     cr5, L(medium)
  83 /* Align to 32-byte boundary.  */
  84         andi.   rALIGN, rMEMP, 0x18
  85         subfic  rALIGN, rALIGN, 0x20
  86         insrdi  rCHR, rCHR, 32, 0       /* Replicate word to double word. */
  87         beq     L(caligned)
  88         mtcrf   0x01, rALIGN
  89         add     rMEMP, rMEMP, rALIGN
  90         sub     rLEN, rLEN, rALIGN
  91         cmplwi  cr1, rALIGN, 0x10
  92         mr      rMEMP2, rMEMP
  93         bf      28, L(a1)
  94         stdu    rCHR, -8(rMEMP2)
  95 L(a1):  blt     cr1, L(a2)
  96         std     rCHR, -8(rMEMP2)
  97         stdu    rCHR, -16(rMEMP2)
  98 L(a2):
  99
 100 /* Now aligned to a 32 byte boundary.  */
 101         .align 4
 102 L(caligned):
 103         cmpldi  cr1, rCHR, 0
 104         clrrdi. rALIGN, rLEN, 5
 105         mtcrf   0x01, rLEN
 106         beq     cr1, L(zloopstart) /* Special case for clearing memory using dcbz.  */
 107         beq     L(medium)       /* We may not actually get to do a full line.  */
 108         .align 4
 109 /* Storing a non-zero "c" value. We are aligned at a sector (32-byte)
 110    boundary may not be at cache line (128-byte) boundary.  */
 111 L(nzloopstart):
 112 /* memset in 32-byte chunks until we get to a cache line boundary.
 113    If rLEN is less than the distance to the next cache-line boundary use
 114    cacheAligned1 code to finish the tail.  */
 115         cmpldi  cr1,rLEN,128
 116
 117         andi.   rTMP,rMEMP,127
 118         blt     cr1,L(cacheAligned1)
 119         addi    rMEMP3,rMEMP,32
 120         beq     L(nzCacheAligned)
 121         addi    rLEN,rLEN,-32
 122         std     rCHR,0(rMEMP)
 123         std     rCHR,8(rMEMP)
 124         std     rCHR,16(rMEMP)
 125         addi    rMEMP,rMEMP,32
 126         andi.   rTMP,rMEMP3,127
 127         std     rCHR,-8(rMEMP3)
 128
 129         beq     L(nzCacheAligned)
 130         addi    rLEN,rLEN,-32
 131         std     rCHR,0(rMEMP3)
 132         addi    rMEMP,rMEMP,32
 133         std     rCHR,8(rMEMP3)
 134         andi.   rTMP,rMEMP,127
 135         std     rCHR,16(rMEMP3)
 136         std     rCHR,24(rMEMP3)
 137
 138         beq     L(nzCacheAligned)
 139         addi    rLEN,rLEN,-32
 140         std     rCHR,32(rMEMP3)
 141         addi    rMEMP,rMEMP,32
 142         cmpldi  cr1,rLEN,128
 143         std     rCHR,40(rMEMP3)
 144         cmpldi  cr6,rLEN,256
 145         li      rMEMP2,128
 146         std     rCHR,48(rMEMP3)
 147         std     rCHR,56(rMEMP3)
 148         blt     cr1,L(cacheAligned1)
 149         b       L(nzCacheAligned128)
 150
 151 /* Now we are aligned to the cache line and can use dcbtst.  */
 152         .align 4
 153 L(nzCacheAligned):
 154         cmpldi  cr1,rLEN,128
 155         blt     cr1,L(cacheAligned1)
 156         b       L(nzCacheAligned128)
 157         .align 5
 158 L(nzCacheAligned128):
 159         cmpldi  cr1,rLEN,256
 160         addi    rMEMP3,rMEMP,64
 161         std     rCHR,0(rMEMP)
 162         std     rCHR,8(rMEMP)
 163         std     rCHR,16(rMEMP)
 164         std     rCHR,24(rMEMP)
 165         std     rCHR,32(rMEMP)
 166         std     rCHR,40(rMEMP)
 167         std     rCHR,48(rMEMP)
 168         std     rCHR,56(rMEMP)
 169         addi    rMEMP,rMEMP3,64
 170         addi    rLEN,rLEN,-128
 171         std     rCHR,0(rMEMP3)
 172         std     rCHR,8(rMEMP3)
 173         std     rCHR,16(rMEMP3)
 174         std     rCHR,24(rMEMP3)
 175         std     rCHR,32(rMEMP3)
 176         std     rCHR,40(rMEMP3)
 177         std     rCHR,48(rMEMP3)
 178         std     rCHR,56(rMEMP3)
 179         bge     cr1,L(nzCacheAligned128)
 180         dcbtst  0,rMEMP
 181         b       L(cacheAligned1)
 182         .align 5
 183 /* Storing a zero "c" value. We are aligned at a sector (32-byte)
 184    boundary but may not be at cache line (128-byte) boundary.  If the
 185    remaining length spans a full cache line we can use the Data cache
 186    block zero instruction. */
 187 L(zloopstart):
 188 /* memset in 32-byte chunks until we get to a cache line boundary.
 189    If rLEN is less than the distance to the next cache-line boundary use
 190    cacheAligned1 code to finish the tail.  */
 191         cmpldi  cr1,rLEN,128
 192         beq     L(medium)
 193 L(getCacheAligned):
 194         andi.   rTMP,rMEMP,127
 195         nop
 196         blt     cr1,L(cacheAligned1)
 197         addi    rMEMP3,rMEMP,32
 198         beq     L(cacheAligned)
 199         addi    rLEN,rLEN,-32
 200         std     rCHR,0(rMEMP)
 201         std     rCHR,8(rMEMP)
 202         std     rCHR,16(rMEMP)
 203         addi    rMEMP,rMEMP,32
 204         andi.   rTMP,rMEMP3,127
 205         std     rCHR,-8(rMEMP3)
 206 L(getCacheAligned2):
 207         beq     L(cacheAligned)
 208         addi    rLEN,rLEN,-32
 209         std     rCHR,0(rMEMP3)
 210         std     rCHR,8(rMEMP3)
 211         addi    rMEMP,rMEMP,32
 212         andi.   rTMP,rMEMP,127
 213         std     rCHR,16(rMEMP3)
 214         std     rCHR,24(rMEMP3)
 215 L(getCacheAligned3):
 216         beq     L(cacheAligned)
 217         addi    rLEN,rLEN,-32
 218         std     rCHR,32(rMEMP3)
 219         addi    rMEMP,rMEMP,32
 220         cmpldi  cr1,rLEN,128
 221         std     rCHR,40(rMEMP3)
 222         cmpldi  cr6,rLEN,256
 223         li      rMEMP2,128
 224         std     rCHR,48(rMEMP3)
 225         std     rCHR,56(rMEMP3)
 226         blt     cr1,L(cacheAligned1)
 227         blt     cr6,L(cacheAligned128)
 228         b       L(cacheAlignedx)
 229
 230 /* Now we are aligned to the cache line and can use dcbz.  */
 231         .align 5
 232 L(cacheAligned):
 233         cmpldi  cr1,rLEN,128
 234         cmpldi  cr6,rLEN,256
 235         blt     cr1,L(cacheAligned1)
 236         li      rMEMP2,128
 237 L(cacheAlignedx):
 238         cmpldi  cr5,rLEN,640
 239         blt     cr6,L(cacheAligned128)
 240         bgt     cr5,L(cacheAligned512)
 241         cmpldi  cr6,rLEN,512
 242         dcbz    0,rMEMP
 243         cmpldi  cr1,rLEN,384
 244         dcbz    rMEMP2,rMEMP
 245         addi    rMEMP,rMEMP,256
 246         addi    rLEN,rLEN,-256
 247         blt     cr1,L(cacheAligned1)
 248         blt     cr6,L(cacheAligned128)
 249         b       L(cacheAligned256)
 250         .align 5
 251 /* A simple loop for the longer (>640 bytes) lengths.  This form limits
 252    the branch miss-predicted to exactly 1 at loop exit.*/
 253 L(cacheAligned512):
 254         cmpli   cr1,rLEN,128
 255         blt     cr1,L(cacheAligned1)
 256         dcbz    0,rMEMP
 257         addi    rLEN,rLEN,-128
 258         addi    rMEMP,rMEMP,128
 259         b       L(cacheAligned512)
 260         .align 5
 261 L(cacheAligned256):
 262
 263         cmpldi  cr6,rLEN,512
 264
 265         dcbz    0,rMEMP
 266         cmpldi  cr1,rLEN,384
 267         dcbz    rMEMP2,rMEMP
 268         addi    rMEMP,rMEMP,256
 269         addi    rLEN,rLEN,-256
 270
 271         bge     cr6,L(cacheAligned256)
 272
 273         blt     cr1,L(cacheAligned1)
 274         .align 4
 275 L(cacheAligned128):
 276         dcbz    0,rMEMP
 277         addi    rMEMP,rMEMP,128
 278         addi    rLEN,rLEN,-128
 279         nop
 280 L(cacheAligned1):
 281         cmpldi  cr1,rLEN,32
 282         blt     cr1,L(handletail32)
 283         addi    rMEMP3,rMEMP,32
 284         addi    rLEN,rLEN,-32
 285         std     rCHR,0(rMEMP)
 286         std     rCHR,8(rMEMP)
 287         std     rCHR,16(rMEMP)
 288         addi    rMEMP,rMEMP,32
 289         cmpldi  cr1,rLEN,32
 290         std     rCHR,-8(rMEMP3)
 291 L(cacheAligned2):
 292         blt     cr1,L(handletail32)
 293         addi    rLEN,rLEN,-32
 294         std     rCHR,0(rMEMP3)
 295         std     rCHR,8(rMEMP3)
 296         addi    rMEMP,rMEMP,32
 297         cmpldi  cr1,rLEN,32
 298         std     rCHR,16(rMEMP3)
 299         std     rCHR,24(rMEMP3)
 300         nop
 301 L(cacheAligned3):
 302         blt     cr1,L(handletail32)
 303         addi    rMEMP,rMEMP,32
 304         addi    rLEN,rLEN,-32
 305         std     rCHR,32(rMEMP3)
 306         std     rCHR,40(rMEMP3)
 307         std     rCHR,48(rMEMP3)
 308         std     rCHR,56(rMEMP3)
 309
 310 /* We are here because the length or remainder (rLEN) is less than the
 311    cache line/sector size and does not justify aggressive loop unrolling.
 312    So set up the preconditions for L(medium) and go there.  */
 313         .align 3
 314 L(handletail32):
 315         cmpldi  cr1,rLEN,0
 316         beqlr   cr1
 317         b       L(medium)
 318
 319         .align 5
 320 L(small):
 321 /* Memset of 8 bytes or less.  */
 322         cmpldi  cr6, rLEN, 4
 323         cmpldi  cr5, rLEN, 1
 324         ble     cr6,L(le4)
 325         subi    rLEN, rLEN, 4
 326         stb     rCHR,0(rMEMP)
 327         stb     rCHR,1(rMEMP)
 328         stb     rCHR,2(rMEMP)
 329         stb     rCHR,3(rMEMP)
 330         addi    rMEMP,rMEMP, 4
 331         cmpldi  cr5, rLEN, 1
 332 L(le4):
 333         cmpldi  cr1, rLEN, 3
 334         bltlr   cr5
 335         stb     rCHR, 0(rMEMP)
 336         beqlr   cr5
 337         stb     rCHR, 1(rMEMP)
 338         bltlr   cr1
 339         stb     rCHR, 2(rMEMP)
 340         beqlr   cr1
 341         stb     rCHR, 3(rMEMP)
 342         blr
 343
 344 /* Memset of 0-31 bytes.  */
 345         .align 5
 346 L(medium):
 347         insrdi  rCHR, rCHR, 32, 0       /* Replicate word to double word.  */
 348         cmpldi  cr1, rLEN, 16
 349 L(medium_tail2):
 350         add     rMEMP, rMEMP, rLEN
 351 L(medium_tail):
 352         bt-     31, L(medium_31t)
 353         bt-     30, L(medium_30t)
 354 L(medium_30f):
 355         bt      29, L(medium_29t)
 356 L(medium_29f):
 357         bge     cr1, L(medium_27t)
 358         bflr    28
 359         std     rCHR, -8(rMEMP)
 360         blr
 361
 362 L(medium_31t):
 363         stbu    rCHR, -1(rMEMP)
 364         bf-     30, L(medium_30f)
 365 L(medium_30t):
 366         sthu    rCHR, -2(rMEMP)
 367         bf-     29, L(medium_29f)
 368 L(medium_29t):
 369         stwu    rCHR, -4(rMEMP)
 370         blt     cr1, L(medium_27f)
 371 L(medium_27t):
 372         std     rCHR, -8(rMEMP)
 373         stdu    rCHR, -16(rMEMP)
 374 L(medium_27f):
 375         bflr    28
 376 L(medium_28t):
 377         std     rCHR, -8(rMEMP)
 378         blr
 379 END_GEN_TB (memset,TB_TOCLESS)
 380 libc_hidden_builtin_def (memset)
 381
 382 /* Copied from bzero.S to prevent the linker from inserting a stub
 383    between bzero and memset.  */
 384 ENTRY (__bzero)
 385         CALL_MCOUNT 3
 386         mr      r5,r4
 387         li      r4,0
 388         b       L(_memset)
 389 END (__bzero)
 390 #ifndef __bzero
 391 weak_alias (__bzero, bzero)
 392 #endif