arch/riscv/lib/memcpy.S

   1 /* SPDX-License-Identifier: GPL-2.0-only */
   2 /*
   3  * Copyright (C) 2013 Regents of the University of California
   4  */
   5
   6 #include <linux/linkage.h>
   7 #include <asm/asm.h>
   8
   9 /* void *memcpy(void *dst, const void *src, size_t n): a0 = dst, a1 = src, a2 = n; returns dst in a0 */
  10 ENTRY(__memcpy)
  11 WEAK(memcpy)
  12         beq     a0, a1, .Lcopy_end  /* dst == src: nothing to copy, a0 already holds dst */
  13         /* a0 is advanced while copying; keep the original dst for the return value */
  14         mv      t6, a0
  15
  16         /*
  17          * Register allocation for code below (t6 holds the original dst):
  18          * a0 - start of uncopied dst
  19          * a1 - start of uncopied src
  20          * t0 - end of uncopied dst (temporarily biased downwards by the word loops)
  21          */
  22         add     t0, a0, a2
  23
  24         /*
  25          * Use bytewise copy if too small.
  26          *
  27          * This threshold must be at least 2*SZREG to ensure at least one
  28          * wordwise copy is performed. It is chosen to be 16 because it will
  29          * save at least 7 iterations of bytewise copy, which pays off the
  30          * fixed overhead.
  31          */
  32         li      a3, 16
  33         bltu    a2, a3, .Lbyte_copy_tail
  34
  35         /*
  36          * Bytewise copy first to align a0 to word boundary.
  37          */
  38         addi    a2, a0, SZREG-1  /* the length in a2 is dead now (t0 marks the end); reuse a2 */
  39         andi    a2, a2, ~(SZREG-1)  /* a2 = a0 rounded up to a multiple of SZREG */
  40         beq     a0, a2, 2f  /* dst is already aligned */
  41 1:
  42         lb      a5, 0(a1)
  43         addi    a1, a1, 1
  44         sb      a5, 0(a0)
  45         addi    a0, a0, 1
  46         bne     a0, a2, 1b
  47 2:
  48
  49         /*
  50          * Now a0 is word-aligned. If a1 is also word aligned, we could perform
  51          * aligned word-wise copy. Otherwise we need to perform misaligned
  52          * word-wise copy.
  53          */
  54         andi    a3, a1, SZREG-1  /* a3 = src misalignment in bytes, reused by the misaligned path */
  55         bnez    a3, .Lmisaligned_word_copy
  56
  57         /* Unrolled wordwise copy, 16 words per iteration */
  58         addi    t0, t0, -(16*SZREG-1)  /* bias t0 so that a0 < t0 iff a whole 16-word chunk remains */
  59         bgeu    a0, t0, 2f
  60 1:
  61         REG_L   a2,        0(a1)
  62         REG_L   a3,    SZREG(a1)
  63         REG_L   a4,  2*SZREG(a1)
  64         REG_L   a5,  3*SZREG(a1)
  65         REG_L   a6,  4*SZREG(a1)
  66         REG_L   a7,  5*SZREG(a1)
  67         REG_L   t1,  6*SZREG(a1)
  68         REG_L   t2,  7*SZREG(a1)
  69         REG_L   t3,  8*SZREG(a1)
  70         REG_L   t4,  9*SZREG(a1)
  71         REG_L   t5, 10*SZREG(a1)
  72         REG_S   a2,        0(a0)
  73         REG_S   a3,    SZREG(a0)
  74         REG_S   a4,  2*SZREG(a0)
  75         REG_S   a5,  3*SZREG(a0)
  76         REG_S   a6,  4*SZREG(a0)
  77         REG_S   a7,  5*SZREG(a0)
  78         REG_S   t1,  6*SZREG(a0)
  79         REG_S   t2,  7*SZREG(a0)
  80         REG_S   t3,  8*SZREG(a0)
  81         REG_S   t4,  9*SZREG(a0)
  82         REG_S   t5, 10*SZREG(a0)
  83         REG_L   a2, 11*SZREG(a1)
  84         REG_L   a3, 12*SZREG(a1)
  85         REG_L   a4, 13*SZREG(a1)
  86         REG_L   a5, 14*SZREG(a1)
  87         REG_L   a6, 15*SZREG(a1)
  88         addi    a1, a1, 16*SZREG
  89         REG_S   a2, 11*SZREG(a0)
  90         REG_S   a3, 12*SZREG(a0)
  91         REG_S   a4, 13*SZREG(a0)
  92         REG_S   a5, 14*SZREG(a0)
  93         REG_S   a6, 15*SZREG(a0)
  94         addi    a0, a0, 16*SZREG
  95         bltu    a0, t0, 1b
  96 2:
  97         /* Post-loop increment by 16*SZREG-1 and pre-loop decrement by SZREG-1 */
  98         addi    t0, t0, 15*SZREG  /* net effect: t0 = end of dst - (SZREG-1) */
  99
 100         /* Wordwise copy while at least one whole word remains */
 101         bgeu    a0, t0, 2f
 102 1:
 103         REG_L   a5, 0(a1)
 104         addi    a1, a1, SZREG
 105         REG_S   a5, 0(a0)
 106         addi    a0, a0, SZREG
 107         bltu    a0, t0, 1b
 108 2:
 109         addi    t0, t0, SZREG-1  /* undo the bias: t0 is the true end of dst again */
 110
 111 .Lbyte_copy_tail:
 112         /*
 113          * Bytewise copy anything left (the entire copy for lengths below the threshold).
 114          */
 115         beq     a0, t0, 2f
 116 1:
 117         lb      a5, 0(a1)
 118         addi    a1, a1, 1
 119         sb      a5, 0(a0)
 120         addi    a0, a0, 1
 121         bne     a0, t0, 1b
 122 2:
 123
 124         mv      a0, t6
 125 .Lcopy_end:
 126         ret
 127
 128 .Lmisaligned_word_copy:
 129         /*
 130          * Misaligned word-wise copy.
 131          * For misaligned copy we still perform word-wise copy, but we need to
 132          * use the value fetched from the previous iteration and do some shifts.
 133          * This is safe because we wouldn't access more words than necessary.
 134          */
 135
 136         /* Calculate shifts: t3 = src misalignment in bits, t4 = -t3 */
 137         slli    t3, a3, 3
 138         sub     t4, x0, t3 /* negate is okay as shift will only look at LSBs */
 139
 140         /* Load the initial value and align a1 */
 141         andi    a1, a1, ~(SZREG-1)
 142         REG_L   a5, 0(a1)
 143
 144         addi    t0, t0, -(SZREG-1)  /* bias t0 so that a0 < t0 iff a whole word remains */
 145         /* At least one iteration will be executed here, no check */
 146 1:
 147         srl     a4, a5, t3  /* a4 = previously loaded word shifted down by the misalignment */
 148         REG_L   a5, SZREG(a1)
 149         addi    a1, a1, SZREG
 150         sll     a2, a5, t4  /* a2 = newly loaded word shifted up to fill the remaining bits */
 151         or      a2, a2, a4
 152         REG_S   a2, 0(a0)
 153         addi    a0, a0, SZREG
 154         bltu    a0, t0, 1b
 155
 156         /* Undo the t0 bias and add the byte offset back to the aligned a1 */
 157         addi    t0, t0, SZREG-1
 158         add     a1, a1, a3
 159
 160         j       .Lbyte_copy_tail
 161 END(__memcpy)