arch/xtensa/lib/memcopy.S

   1 /*
    2  * arch/xtensa/lib/memcopy.S -- Core HAL library functions
    3  * xthal_memcpy and xthal_bcopy (provided here as memcpy, bcopy and memmove)
   4  *
   5  * This file is subject to the terms and conditions of the GNU General Public
   6  * License.  See the file "COPYING" in the main directory of this archive
   7  * for more details.
   8  *
   9  * Copyright (C) 2002 - 2012 Tensilica Inc.
  10  */
  11
  12 #include <variant/core.h>
  13
  14         .macro  src_b   r, w0, w1
  15 #ifdef __XTENSA_EB__
  16         src     \r, \w0, \w1
  17 #else
  18         src     \r, \w1, \w0
  19 #endif
  20         .endm
  21
  22         .macro  ssa8    r
  23 #ifdef __XTENSA_EB__
  24         ssa8b   \r
  25 #else
  26         ssa8l   \r
  27 #endif
  28         .endm
  29
  30 /*
  31  * void *memcpy(void *dst, const void *src, size_t len);
  32  *
  33  * This function is intended to do the same thing as the standard
  34  * library function memcpy() for most cases.
  35  * However, where the source and/or destination references
  36  * an instruction RAM or ROM or a data RAM or ROM, that
  37  * source and/or destination will always be accessed with
  38  * 32-bit load and store instructions (as required for these
  39  * types of devices).
  40  *
  41  * !!!!!!!  XTFIXME:
  42  * !!!!!!!  Handling of IRAM/IROM has not yet
  43  * !!!!!!!  been implemented.
  44  *
  45  * The (general case) algorithm is as follows:
  46  *   If destination is unaligned, align it by conditionally
  47  *     copying 1 and 2 bytes.
  48  *   If source is aligned,
  49  *     do 16 bytes with a loop, and then finish up with
  50  *     8, 4, 2, and 1 byte copies conditional on the length;
  51  *   else (if source is unaligned),
  52  *     do the same, but use SRC to align the source data.
  53  *   This code tries to use fall-through branches for the common
  54  *     case of aligned source and destination and multiple
  55  *     of 4 (or 8) length.
  56  *
  57  * Register use:
  58  *      a0/ return address
  59  *      a1/ stack pointer
  60  *      a2/ return value
  61  *      a3/ src
  62  *      a4/ length
  63  *      a5/ dst
  64  *      a6/ tmp
  65  *      a7/ tmp
  66  *      a8/ tmp
  67  *      a9/ tmp
  68  *      a10/ tmp
  69  *      a11/ tmp
  70  */
  71
  72         .text
  73
   74 /*
   75  * Byte by byte copy: a4 bytes from a3 (src) to a5 (dst), then return
   76  */
   77         .align  4
   78         .byte   0               # 1 mod 4 alignment for LOOPNEZ
   79                                 # (0 mod 4 alignment for LBEG)
   80 .Lbytecopy:
   81 #if XCHAL_HAVE_LOOPS
   82         loopnez a4, .Lbytecopydone      # loop a4 times, skipped if a4 == 0
   83 #else /* !XCHAL_HAVE_LOOPS */
   84         beqz    a4, .Lbytecopydone      # nothing to copy
   85         add     a7, a3, a4      # a7 = end address for source
   86 #endif /* !XCHAL_HAVE_LOOPS */
   87 .Lnextbyte:
   88         l8ui    a6, a3, 0       # a6 = next source byte
   89         addi    a3, a3, 1
   90         s8i     a6, a5, 0       # store it to dst
   91         addi    a5, a5, 1
   92 #if !XCHAL_HAVE_LOOPS
   93         bne     a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
   94 #endif /* !XCHAL_HAVE_LOOPS */
   95 .Lbytecopydone:
   96         retw
  97
   98 /*
   99  * Destination is unaligned: copy 1 and/or 2 bytes until a5 is word-aligned
  100  */
  101
  102         .align  4
  103 .Ldst1mod2:     # dst is only byte aligned
  104         _bltui  a4, 7, .Lbytecopy       # do short copies byte by byte
  105
  106         # copy 1 byte
  107         l8ui    a6, a3,  0
  108         addi    a3, a3,  1
  109         addi    a4, a4, -1      # one byte less to copy
  110         s8i     a6, a5,  0
  111         addi    a5, a5,  1
  112         _bbci.l a5, 1, .Ldstaligned     # if dst is now aligned, then
  113                                         # return to main algorithm
  114 .Ldst2mod4:     # dst 16-bit aligned
  115         # copy 2 bytes
  116         _bltui  a4, 6, .Lbytecopy       # do short copies byte by byte
  117         l8ui    a6, a3,  0      # bytewise loads: src alignment not checked yet
  118         l8ui    a7, a3,  1
  119         addi    a3, a3,  2
  120         addi    a4, a4, -2      # two bytes less to copy
  121         s8i     a6, a5,  0
  122         s8i     a7, a5,  1
  123         addi    a5, a5,  2
  124         j       .Ldstaligned    # dst is now aligned, return to main algorithm
 125
  126         .align  4
  127         .global memcpy
  128         .type   memcpy,@function
  129 memcpy:
  130
  131         entry   sp, 16          # minimal stack frame
  132         # a2/ dst, a3/ src, a4/ len
  133         mov     a5, a2          # copy dst so that a2 is return value
  134 .Lcommon:       # also reached from .Lmovecommon; a2 == a5 (dst) here
  135         _bbsi.l a2, 0, .Ldst1mod2       # if dst is 1 mod 2
  136         _bbsi.l a2, 1, .Ldst2mod4       # if dst is 2 mod 4
  137 .Ldstaligned:   # return here from .Ldst?mod? once dst is aligned
  138         srli    a7, a4, 4       # number of loop iterations with 16B
  139                                 # per iteration
  140         movi    a8, 3           # if source is not aligned,
  141         _bany   a3, a8, .Lsrcunaligned  # then use shifting copy
  142         /*
  143          * Destination and source are word-aligned, use word copy.
  144          */
  145         # copy 16 bytes per iteration for word-aligned dst and word-aligned src
  146 #if XCHAL_HAVE_LOOPS
  147         loopnez a7, .Loop1done  # loop a7 times, skipped if a7 == 0
  148 #else /* !XCHAL_HAVE_LOOPS */
  149         beqz    a7, .Loop1done  # fewer than 16 bytes: go straight to the tail
  150         slli    a8, a7, 4       # a8 = bytes covered by the 16B loop
  151         add     a8, a8, a3      # a8 = end of last 16B source chunk
  152 #endif /* !XCHAL_HAVE_LOOPS */
  153 .Loop1:
  154         l32i    a6, a3,  0      # loads and stores alternate between a6 and a7
  155         l32i    a7, a3,  4
  156         s32i    a6, a5,  0
  157         l32i    a6, a3,  8
  158         s32i    a7, a5,  4
  159         l32i    a7, a3, 12
  160         s32i    a6, a5,  8
  161         addi    a3, a3, 16
  162         s32i    a7, a5, 12
  163         addi    a5, a5, 16
  164 #if !XCHAL_HAVE_LOOPS
  165         bne     a3, a8, .Loop1  # continue loop if a3:src != a8:src_end
  166 #endif /* !XCHAL_HAVE_LOOPS */
  167 .Loop1done:
  168         bbci.l  a4, 3, .L2      # skip unless bit 3 of len is set
  169         # copy 8 bytes
  170         l32i    a6, a3,  0
  171         l32i    a7, a3,  4
  172         addi    a3, a3,  8
  173         s32i    a6, a5,  0
  174         s32i    a7, a5,  4
  175         addi    a5, a5,  8
  176 .L2:
  177         bbsi.l  a4, 2, .L3      # bit 2 of len set: 4 more bytes
  178         bbsi.l  a4, 1, .L4      # bit 1 of len set: 2 more bytes
  179         bbsi.l  a4, 0, .L5      # bit 0 of len set: 1 more byte
  180         retw
  181 .L3:
  182         # copy 4 bytes
  183         l32i    a6, a3,  0
  184         addi    a3, a3,  4
  185         s32i    a6, a5,  0
  186         addi    a5, a5,  4
  187         bbsi.l  a4, 1, .L4      # bit 1 of len set: 2 more bytes
  188         bbsi.l  a4, 0, .L5      # bit 0 of len set: 1 more byte
  189         retw
  190 .L4:
  191         # copy 2 bytes
  192         l16ui   a6, a3,  0
  193         addi    a3, a3,  2
  194         s16i    a6, a5,  0
  195         addi    a5, a5,  2
  196         bbsi.l  a4, 0, .L5      # bit 0 of len set: 1 more byte
  197         retw
  198 .L5:
  199         # copy 1 byte
  200         l8ui    a6, a3,  0
  201         s8i     a6, a5,  0
  202         retw
 203
  204 /*
  205  * Destination is aligned, Source is unaligned
  206  */
  207
  208         .align  4
  209 .Lsrcunaligned:
  210         _beqz   a4, .Ldone      # avoid loading anything for zero-length copies
  211         # copy 16 bytes per iteration for word-aligned dst and unaligned src
  212         ssa8    a3              # set shift amount from byte offset
  213
  214 /* set to 1 when running on ISS (simulator) with the
  215    lint or ferret client, or 0 to save a few cycles */
  216 #define SIM_CHECKS_ALIGNMENT    1
  217 #if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
  218         and     a11, a3, a8     # a8 == 3 here: save unalignment offset for below
  219         sub     a3, a3, a11     # align a3
  220 #endif
  221         l32i    a6, a3, 0       # load first word
  222 #if XCHAL_HAVE_LOOPS
  223         loopnez a7, .Loop2done  # loop a7 times, skipped if a7 == 0
  224 #else /* !XCHAL_HAVE_LOOPS */
  225         beqz    a7, .Loop2done  # fewer than 16 bytes: go straight to the tail
  226         slli    a10, a7, 4      # a10 = bytes covered by the 16B loop
  227         add     a10, a10, a3    # a10 = end of last 16B source chunk
  228 #endif /* !XCHAL_HAVE_LOOPS */
  229 .Loop2:
  230         l32i    a7, a3,  4
  231         l32i    a8, a3,  8
  232         src_b   a6, a6, a7      # combine two source words into one dst word
  233         s32i    a6, a5,  0
  234         l32i    a9, a3, 12
  235         src_b   a7, a7, a8
  236         s32i    a7, a5,  4
  237         l32i    a6, a3, 16      # becomes the first word of the next iteration
  238         src_b   a8, a8, a9
  239         s32i    a8, a5,  8
  240         addi    a3, a3, 16
  241         src_b   a9, a9, a6
  242         s32i    a9, a5, 12
  243         addi    a5, a5, 16
  244 #if !XCHAL_HAVE_LOOPS
  245         bne     a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
  246 #endif /* !XCHAL_HAVE_LOOPS */
  247 .Loop2done:
  248         bbci.l  a4, 3, .L12     # skip unless bit 3 of len is set
  249         # copy 8 bytes
  250         l32i    a7, a3,  4
  251         l32i    a8, a3,  8
  252         src_b   a6, a6, a7
  253         s32i    a6, a5,  0
  254         addi    a3, a3,  8
  255         src_b   a7, a7, a8
  256         s32i    a7, a5,  4
  257         addi    a5, a5,  8
  258         mov     a6, a8          # keep the last loaded word in a6
  259 .L12:
  260         bbci.l  a4, 2, .L13     # skip unless bit 2 of len is set
  261         # copy 4 bytes
  262         l32i    a7, a3,  4
  263         addi    a3, a3,  4
  264         src_b   a6, a6, a7
  265         s32i    a6, a5,  0
  266         addi    a5, a5,  4
  267         mov     a6, a7          # keep the last loaded word in a6
  268 .L13:
  269 #if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
  270         add     a3, a3, a11     # readjust a3 with correct misalignment
  271 #endif
  272         bbsi.l  a4, 1, .L14     # bit 1 of len set: 2 more bytes
  273         bbsi.l  a4, 0, .L15     # bit 0 of len set: 1 more byte
  274 .Ldone: retw
  275 .L14:
  276         # copy 2 bytes
  277         l8ui    a6, a3,  0      # bytewise: src is unaligned on this path
  278         l8ui    a7, a3,  1
  279         addi    a3, a3,  2
  280         s8i     a6, a5,  0
  281         s8i     a7, a5,  1
  282         addi    a5, a5,  2
  283         bbsi.l  a4, 0, .L15     # bit 0 of len set: 1 more byte
  284         retw
  285 .L15:
  286         # copy 1 byte
  287         l8ui    a6, a3,  0
  288         s8i     a6, a5,  0
  289         retw
 290
 291
  292 /*
  293  * void bcopy(const void *src, void *dest, size_t n);
  294  */
  295         .align  4
  296         .global bcopy
  297         .type   bcopy,@function
  298 bcopy:
  299         entry   sp, 16          # minimal stack frame
  300         # a2=src, a3=dst, a4=len
  301         mov     a5, a3          # a5 = dst
  302         mov     a3, a2          # a3 = src
  303         mov     a2, a5          # a2 = dst, same register layout as memmove
  304         j       .Lmovecommon    # go to common code for memmove+bcopy
 305
 306 /*
 307  * void *memmove(void *dst, const void *src, size_t len);
 308  *
 309  * This function is intended to do the same thing as the standard
 310  * library function memmove() for most cases.
 311  * However, where the source and/or destination references
 312  * an instruction RAM or ROM or a data RAM or ROM, that
 313  * source and/or destination will always be accessed with
 314  * 32-bit load and store instructions (as required for these
 315  * types of devices).
 316  *
 317  * !!!!!!!  XTFIXME:
 318  * !!!!!!!  Handling of IRAM/IROM has not yet
 319  * !!!!!!!  been implemented.
 320  *
 321  * The (general case) algorithm is as follows:
  322  *   If (dst - src), taken as unsigned, is >= len, copy forward as memcpy does.
  323  *   Otherwise copy backwards, starting from the end of both buffers.
 324  *
 325  * Register use:
 326  *      a0/ return address
 327  *      a1/ stack pointer
 328  *      a2/ return value
 329  *      a3/ src
 330  *      a4/ length
 331  *      a5/ dst
 332  *      a6/ tmp
 333  *      a7/ tmp
 334  *      a8/ tmp
 335  *      a9/ tmp
 336  *      a10/ tmp
 337  *      a11/ tmp
 338  */
 339
  340 /*
  341  * Byte by byte copy, backwards: a3/a5 point just past the a4 bytes to copy
  342  */
  343         .align  4
  344         .byte   0               # 1 mod 4 alignment for LOOPNEZ
  345                                 # (0 mod 4 alignment for LBEG)
  346 .Lbackbytecopy:
  347 #if XCHAL_HAVE_LOOPS
  348         loopnez a4, .Lbackbytecopydone  # loop a4 times, skipped if a4 == 0
  349 #else /* !XCHAL_HAVE_LOOPS */
  350         beqz    a4, .Lbackbytecopydone  # nothing to copy
  351         sub     a7, a3, a4      # a7 = start address for source
  352 #endif /* !XCHAL_HAVE_LOOPS */
  353 .Lbacknextbyte:
  354         addi    a3, a3, -1      # step back before each load
  355         l8ui    a6, a3, 0
  356         addi    a5, a5, -1      # step back before each store
  357         s8i     a6, a5, 0
  358 #if !XCHAL_HAVE_LOOPS
  359         bne     a3, a7, .Lbacknextbyte # continue loop if
  360                                        # $a3:src != $a7:src_start
  361 #endif /* !XCHAL_HAVE_LOOPS */
  362 .Lbackbytecopydone:
  363         retw
 364
  365 /*
  366  * Destination is unaligned: copy 1 and/or 2 bytes until a5 is word-aligned
  367  */
  368
  369         .align  4
  370 .Lbackdst1mod2: # dst is only byte aligned
  371         _bltui  a4, 7, .Lbackbytecopy   # do short copies byte by byte
  372
  373         # copy 1 byte
  374         addi    a3, a3, -1
  375         l8ui    a6, a3,  0
  376         addi    a5, a5, -1
  377         s8i     a6, a5,  0
  378         addi    a4, a4, -1      # one byte less to copy
  379         _bbci.l a5, 1, .Lbackdstaligned # if dst is now aligned, then
  380                                         # return to main algorithm
  381 .Lbackdst2mod4: # dst 16-bit aligned
  382         # copy 2 bytes
  383         _bltui  a4, 6, .Lbackbytecopy   # do short copies byte by byte
  384         addi    a3, a3, -2
  385         l8ui    a6, a3,  0      # bytewise loads: src alignment not checked yet
  386         l8ui    a7, a3,  1
  387         addi    a5, a5, -2
  388         s8i     a6, a5,  0
  389         s8i     a7, a5,  1
  390         addi    a4, a4, -2      # two bytes less to copy
  391         j       .Lbackdstaligned        # dst is now aligned,
  392                                         # return to main algorithm
 392                                         # return to main algorithm
 393
  394         .align  4
  395         .global memmove
  396         .type   memmove,@function
  397 memmove:
  398
  399         entry   sp, 16          # minimal stack frame
  400         # a2/ dst, a3/ src, a4/ len
  401         mov     a5, a2          # copy dst so that a2 is return value
  402 .Lmovecommon:   # bcopy joins here with a2 = a5 = dst, a3 = src
  403         sub     a6, a5, a3      # a6 = dst - src
  404         bgeu    a6, a4, .Lcommon        # unsigned (dst - src) >= len: copy forward
  405
  406         add     a5, a5, a4      # a5 = dst + len
  407         add     a3, a3, a4      # a3 = src + len; the copy runs from the end down
  408
  409         _bbsi.l a5, 0, .Lbackdst1mod2   # if dst is 1 mod 2
  410         _bbsi.l a5, 1, .Lbackdst2mod4   # if dst is 2 mod 4
  411 .Lbackdstaligned:       # return here from .Lbackdst?mod? once dst is aligned
  412         srli    a7, a4, 4       # number of loop iterations with 16B
  413                                 # per iteration
  414         movi    a8, 3           # if source is not aligned,
  415         _bany   a3, a8, .Lbacksrcunaligned      # then use shifting copy
  416         /*
  417          * Destination and source are word-aligned, use word copy.
  418          */
  419         # copy 16 bytes per iteration for word-aligned dst and word-aligned src
  420 #if XCHAL_HAVE_LOOPS
  421         loopnez a7, .backLoop1done      # loop a7 times, skipped if a7 == 0
  422 #else /* !XCHAL_HAVE_LOOPS */
  423         beqz    a7, .backLoop1done      # fewer than 16 bytes: go to the tail
  424         slli    a8, a7, 4       # a8 = bytes covered by the 16B loop
  425         sub     a8, a3, a8      # a8 = start of first 16B source chunk
  426 #endif /* !XCHAL_HAVE_LOOPS */
  427 .backLoop1:
  428         addi    a3, a3, -16
  429         l32i    a7, a3, 12      # highest word first; a6 and a7 alternate
  430         l32i    a6, a3,  8
  431         addi    a5, a5, -16
  432         s32i    a7, a5, 12
  433         l32i    a7, a3,  4
  434         s32i    a6, a5,  8
  435         l32i    a6, a3,  0
  436         s32i    a7, a5,  4
  437         s32i    a6, a5,  0
  438 #if !XCHAL_HAVE_LOOPS
  439         bne     a3, a8, .backLoop1  # continue loop if a3:src != a8:src_start
  440 #endif /* !XCHAL_HAVE_LOOPS */
  441 .backLoop1done:
  442         bbci.l  a4, 3, .Lback2  # skip unless bit 3 of len is set
  443         # copy 8 bytes
  444         addi    a3, a3, -8
  445         l32i    a6, a3,  0
  446         l32i    a7, a3,  4
  447         addi    a5, a5, -8
  448         s32i    a6, a5,  0
  449         s32i    a7, a5,  4
  450 .Lback2:
  451         bbsi.l  a4, 2, .Lback3  # bit 2 of len set: 4 more bytes
  452         bbsi.l  a4, 1, .Lback4  # bit 1 of len set: 2 more bytes
  453         bbsi.l  a4, 0, .Lback5  # bit 0 of len set: 1 more byte
  454         retw
  455 .Lback3:
  456         # copy 4 bytes
  457         addi    a3, a3, -4
  458         l32i    a6, a3,  0
  459         addi    a5, a5, -4
  460         s32i    a6, a5,  0
  461         bbsi.l  a4, 1, .Lback4  # bit 1 of len set: 2 more bytes
  462         bbsi.l  a4, 0, .Lback5  # bit 0 of len set: 1 more byte
  463         retw
  464 .Lback4:
  465         # copy 2 bytes
  466         addi    a3, a3, -2
  467         l16ui   a6, a3,  0
  468         addi    a5, a5, -2
  469         s16i    a6, a5,  0
  470         bbsi.l  a4, 0, .Lback5  # bit 0 of len set: 1 more byte
  471         retw
  472 .Lback5:
  473         # copy 1 byte
  474         addi    a3, a3, -1
  475         l8ui    a6, a3,  0
  476         addi    a5, a5, -1
  477         s8i     a6, a5,  0
  478         retw
 479
  480 /*
  481  * Destination is aligned, Source is unaligned
  482  */
  483
  484         .align  4
  485 .Lbacksrcunaligned:
  486         _beqz   a4, .Lbackdone  # avoid loading anything for zero-length copies
  487         # copy 16 bytes per iteration for word-aligned dst and unaligned src
  488         ssa8    a3              # set shift amount from byte offset
  489 #define SIM_CHECKS_ALIGNMENT    1       /* set to 1 when running on ISS with
  490                                          * the lint or ferret client, or 0
  491                                          * to save a few cycles */
  492 #if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
  493         and     a11, a3, a8     # a8 == 3 here: save unalignment offset for below
  494         sub     a3, a3, a11     # align a3
  495 #endif
  496         l32i    a6, a3, 0       # load first word
  497 #if XCHAL_HAVE_LOOPS
  498         loopnez a7, .backLoop2done      # loop a7 times, skipped if a7 == 0
  499 #else /* !XCHAL_HAVE_LOOPS */
  500         beqz    a7, .backLoop2done      # fewer than 16 bytes: go to the tail
  501         slli    a10, a7, 4      # a10 = bytes covered by the 16B loop
  502         sub     a10, a3, a10    # a10 = start of first 16B source chunk
  503 #endif /* !XCHAL_HAVE_LOOPS */
  504 .backLoop2:
  505         addi    a3, a3, -16
  506         l32i    a7, a3, 12
  507         l32i    a8, a3,  8
  508         addi    a5, a5, -16
  509         src_b   a6, a7, a6      # combine two source words into one dst word
  510         s32i    a6, a5, 12
  511         l32i    a9, a3,  4
  512         src_b   a7, a8, a7
  513         s32i    a7, a5,  8
  514         l32i    a6, a3,  0      # also the carried word for the next iteration
  515         src_b   a8, a9, a8
  516         s32i    a8, a5,  4
  517         src_b   a9, a6, a9
  518         s32i    a9, a5,  0
  519 #if !XCHAL_HAVE_LOOPS
  520         bne     a3, a10, .backLoop2 # continue loop if a3:src != a10:src_start
  521 #endif /* !XCHAL_HAVE_LOOPS */
  522 .backLoop2done:
  523         bbci.l  a4, 3, .Lback12 # skip unless bit 3 of len is set
  524         # copy 8 bytes
  525         addi    a3, a3, -8
  526         l32i    a7, a3,  4
  527         l32i    a8, a3,  0
  528         addi    a5, a5, -8
  529         src_b   a6, a7, a6
  530         s32i    a6, a5,  4
  531         src_b   a7, a8, a7
  532         s32i    a7, a5,  0
  533         mov     a6, a8          # keep the last loaded word in a6
  534 .Lback12:
  535         bbci.l  a4, 2, .Lback13 # skip unless bit 2 of len is set
  536         # copy 4 bytes
  537         addi    a3, a3, -4
  538         l32i    a7, a3,  0
  539         addi    a5, a5, -4
  540         src_b   a6, a7, a6
  541         s32i    a6, a5,  0
  542         mov     a6, a7          # keep the last loaded word in a6
  543 .Lback13:
  544 #if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
  545         add     a3, a3, a11     # readjust a3 with correct misalignment
  546 #endif
  547         bbsi.l  a4, 1, .Lback14 # bit 1 of len set: 2 more bytes
  548         bbsi.l  a4, 0, .Lback15 # bit 0 of len set: 1 more byte
  549 .Lbackdone:
  550         retw
  551 .Lback14:
  552         # copy 2 bytes
  553         addi    a3, a3, -2
  554         l8ui    a6, a3,  0      # bytewise: src is unaligned on this path
  555         l8ui    a7, a3,  1
  556         addi    a5, a5, -2
  557         s8i     a6, a5,  0
  558         s8i     a7, a5,  1
  559         bbsi.l  a4, 0, .Lback15 # bit 0 of len set: 1 more byte
  560         retw
  561 .Lback15:
  562         # copy 1 byte
  563         addi    a3, a3, -1
  564         addi    a5, a5, -1
  565         l8ui    a6, a3,  0
  566         s8i     a6, a5,  0
  567         retw
 568
 569 \f
 570 /*
 571  * Local Variables:
 572  * mode:fundamental
 573  * comment-start: "# "
 574  * comment-start-skip: "# *"
 575  * End:
 576  */