arch/riscv/lib/memcpy.S

   1 /* SPDX-License-Identifier: GPL-2.0-only */
   2 /*
   3  * Copyright (C) 2013 Regents of the University of California
   4  */
   5
   6 #include <linux/linkage.h>
   7 #include <asm/asm.h>
   8
   9 /*
      * void *memcpy(void *, const void *, size_t)
      *
      * Copies a2 bytes from the buffer at a1 to the buffer at a0 and returns
      * the original a0.  The copy always walks from low to high addresses and
      * performs no overlap check.
      *
      * In:     a0 = dst, a1 = src, a2 = byte count
      * Out:    a0 = dst exactly as passed in (parked in t6 meanwhile)
      * Writes: a0-a7 and t0-t6.  sp is never referenced and nothing is
      *         called from here.
      *
      * Strategy:
      *   1. count < 16: plain byte loop (.Lbyte_copy_tail).
      *   2. Otherwise copy bytes until a0 is SZREG-aligned.
      *   3. If a1 is then SZREG-aligned as well: 16-word unrolled loop
      *      followed by a one-word-at-a-time loop.
      *      If not: .Lmisaligned_word_copy, which loads aligned source
      *      words and stitches neighbouring pairs together with shifts.
      *   4. Byte loop for whatever is left (.Lbyte_copy_tail).
      *
      * NOTE(review): SZREG, REG_L and REG_S are not defined in this file
      * (presumably <asm/asm.h>).  The comments below assume SZREG is the
      * register width in bytes and a power of two, and that REG_L/REG_S are
      * the register-width load/store - confirm against that header.
      * NOTE(review): ENTRY()/WEAK() presumably come from <linux/linkage.h>
      * and make memcpy a weak symbol for __memcpy - confirm there.
      */
  10 ENTRY(__memcpy)
  11 WEAK(memcpy)
  12         /* Save for return value: a0 is advanced as the copy proceeds */
  13         mv      t6, a0
  14
  15         /*
  16          * Register allocation for code below:
  17          * a0 - start of uncopied dst
  18          * a1 - start of uncopied src
  19          * t0 - end of uncopied dst
              *      (temporarily biased downwards while a word loop runs, so
              *      that "a0 < t0" means "a whole chunk still fits"; always
              *      restored to the true end before .Lbyte_copy_tail)
              * t6 - original dst, moved back into a0 just before ret
  20          */
  21         add     t0, a0, a2
  22
  23         /*
  24          * Use bytewise copy if too small.
  25          *
  26          * This threshold must be at least 2*SZREG to ensure at least one
  27          * wordwise copy is performed. It is chosen to be 16 because it will
  28          * save at least 7 iterations of bytewise copy, which pays off the
  29          * fixed overhead.
  30          */
  31         li      a3, 16
  32         bltu    a2, a3, .Lbyte_copy_tail /* t0 is still the true end here */
  33
  34         /*
  35          * Bytewise copy first to align a0 to word boundary.
              * The byte count in a2 is no longer needed (t0 holds the end), so
              * a2 is reused as the alignment target. At most SZREG-1 bytes are
              * copied by this loop.
  36          */
  37         addi    a2, a0, SZREG-1
  38         andi    a2, a2, ~(SZREG-1) /* a2 = a0 rounded up to a multiple of SZREG */
  39         beq     a0, a2, 2f /* dst already aligned */
  40 1:
  41         lb      a5, 0(a1) /* sign extension is harmless: sb stores the low byte only */
  42         addi    a1, a1, 1
  43         sb      a5, 0(a0)
  44         addi    a0, a0, 1
  45         bne     a0, a2, 1b
  46 2:
  47
  48         /*
  49          * Now a0 is word-aligned. If a1 is also word aligned, we could perform
  50          * aligned word-wise copy. Otherwise we need to perform misaligned
  51          * word-wise copy.
  52          */
  53         andi    a3, a1, SZREG-1 /* a3 = byte offset of src within its word */
  54         bnez    a3, .Lmisaligned_word_copy
  55
  56         /*
              * Unrolled wordwise copy: 16 words per iteration, moved in two
              * batches (11 words, then 5) using a2-a7 and t1-t5 as scratch.
              * With t0 biased to end-(16*SZREG-1), "a0 < t0" holds exactly
              * when at least 16*SZREG bytes remain.
              */
  57         addi    t0, t0, -(16*SZREG-1)
  58         bgeu    a0, t0, 2f /* fewer than 16 words left: skip the unrolled loop */
  59 1:
  60         REG_L   a2,        0(a1) /* batch 1: load words 0..10 */
  61         REG_L   a3,    SZREG(a1)
  62         REG_L   a4,  2*SZREG(a1)
  63         REG_L   a5,  3*SZREG(a1)
  64         REG_L   a6,  4*SZREG(a1)
  65         REG_L   a7,  5*SZREG(a1)
  66         REG_L   t1,  6*SZREG(a1)
  67         REG_L   t2,  7*SZREG(a1)
  68         REG_L   t3,  8*SZREG(a1)
  69         REG_L   t4,  9*SZREG(a1)
  70         REG_L   t5, 10*SZREG(a1)
  71         REG_S   a2,        0(a0) /* batch 1: store words 0..10 */
  72         REG_S   a3,    SZREG(a0)
  73         REG_S   a4,  2*SZREG(a0)
  74         REG_S   a5,  3*SZREG(a0)
  75         REG_S   a6,  4*SZREG(a0)
  76         REG_S   a7,  5*SZREG(a0)
  77         REG_S   t1,  6*SZREG(a0)
  78         REG_S   t2,  7*SZREG(a0)
  79         REG_S   t3,  8*SZREG(a0)
  80         REG_S   t4,  9*SZREG(a0)
  81         REG_S   t5, 10*SZREG(a0)
  82         REG_L   a2, 11*SZREG(a1) /* batch 2: load words 11..15, reusing a2-a6 */
  83         REG_L   a3, 12*SZREG(a1)
  84         REG_L   a4, 13*SZREG(a1)
  85         REG_L   a5, 14*SZREG(a1)
  86         REG_L   a6, 15*SZREG(a1)
  87         addi    a1, a1, 16*SZREG /* src advanced early; the stores below index off a0 */
  88         REG_S   a2, 11*SZREG(a0) /* batch 2: store words 11..15 */
  89         REG_S   a3, 12*SZREG(a0)
  90         REG_S   a4, 13*SZREG(a0)
  91         REG_S   a5, 14*SZREG(a0)
  92         REG_S   a6, 15*SZREG(a0)
  93         addi    a0, a0, 16*SZREG
  94         bltu    a0, t0, 1b
  95 2:
  96         /*
              * Post-loop increment by 16*SZREG-1 and pre-loop decrement by
              * SZREG-1, folded into one add: t0 becomes end-(SZREG-1), so
              * "a0 < t0" now means at least one whole word remains.
              */
  97         addi    t0, t0, 15*SZREG
  98
  99         /* Wordwise copy */
 100         bgeu    a0, t0, 2f /* less than one word left */
 101 1:
 102         REG_L   a5, 0(a1)
 103         addi    a1, a1, SZREG
 104         REG_S   a5, 0(a0)
 105         addi    a0, a0, SZREG
 106         bltu    a0, t0, 1b
 107 2:
 108         addi    t0, t0, SZREG-1 /* remove the bias: t0 is the true end of dst again */
 109
 110 .Lbyte_copy_tail:
 111         /*
 112          * Bytewise copy anything left.
              * Every path arriving here has t0 equal to the true end of dst;
              * the loop exits on equality, so t0 must not be biased.
 113          */
 114         beq     a0, t0, 2f /* nothing left to copy */
 115 1:
 116         lb      a5, 0(a1)
 117         addi    a1, a1, 1
 118         sb      a5, 0(a0)
 119         addi    a0, a0, 1
 120         bne     a0, t0, 1b
 121 2:
 122
 123         mv      a0, t6 /* return the dst pointer the caller passed in */
 124         ret
 125
 126 .Lmisaligned_word_copy:
 127         /*
 128          * Misaligned word-wise copy.
 129          * For misaligned copy we still perform word-wise copy, but we need to
 130          * use the value fetched from the previous iteration and do some shifts.
 131          * This is safe because we wouldn't access more words than necessary.
              *
              * On entry a0 is word-aligned and a3 (non-zero) is the byte offset
              * of src within its word. Only aligned source words are loaded,
              * and each loaded word holds at least one byte that belongs to
              * the requested source range: the loop runs only while a full dst
              * word remains, and that word needs bytes from both the current
              * and the next aligned source word.
              *
              * a5 - most recently loaded aligned source word
              * t3 - right-shift amount in bits (8 * a3)
              * t4 - left-shift amount in bits (-t3, see below)
 132          */
 133
 134         /* Calculate shifts */
 135         slli    t3, a3, 3 /* byte offset -> bit offset */
 136         sub     t4, x0, t3 /* negate is okay as shift will only look at LSBs */
 137
 138         /* Load the initial value and align a1 */
 139         andi    a1, a1, ~(SZREG-1) /* a1 = src rounded down; the offset stays in a3 */
 140         REG_L   a5, 0(a1)
 141
 142         addi    t0, t0, -(SZREG-1) /* bias: "a0 < t0" means one whole dst word fits */
 143         /*
              * At least one iteration will be executed here, no check:
              * the count was >= 16 on entry and the alignment loop consumed
              * at most SZREG-1 bytes (this is what the 2*SZREG threshold
              * requirement guarantees).
              */
 144 1:
 145         srl     a4, a5, t3 /* upper part of the previous word moves to the low bits */
 146         REG_L   a5, SZREG(a1) /* fetch the next aligned source word */
 147         addi    a1, a1, SZREG
 148         sll     a2, a5, t4 /* lower part of the next word moves to the high bits */
 149         or      a2, a2, a4 /* merged word to store at the aligned dst */
 150         REG_S   a2, 0(a0)
 151         addi    a0, a0, SZREG
 152         bltu    a0, t0, 1b
 153
 154         /* Update pointers to correct value */
 155         addi    t0, t0, SZREG-1 /* remove the bias: t0 is the true end of dst */
 156         add     a1, a1, a3 /* re-apply the offset: a1 is the next uncopied src byte */
 157
 158         j       .Lbyte_copy_tail
 159 END(__memcpy)