riscv: Fix memmove and optimise memcpy when misalign

author Bin Meng <bmeng.cn@gmail.com>

Thu, 13 May 2021 08:46:17 +0000 (16:46 +0800)

committer Leo Yu-Chi Liang <ycliang@andestech.com>

Mon, 17 May 2021 08:47:33 +0000 (16:47 +0800)
author Bin Meng <bmeng.cn@gmail.com>
Thu, 13 May 2021 08:46:17 +0000 (16:46 +0800)
committer Leo Yu-Chi Liang <ycliang@andestech.com>
Mon, 17 May 2021 08:47:33 +0000 (16:47 +0800)
diff --git a/arch/riscv/lib/memcpy.S b/arch/riscv/lib/memcpy.S

index 51ab716253fa3c9939a95b8721b7524099824e63..00672c19ad9b10c40d9ef86629c2b278af8cf137 100644 (file)
--- a/arch/riscv/lib/memcpy.S
+++ b/arch/riscv/lib/memcpy.S
@@ -9,100 +9,151 @@
  /* void *memcpy(void *, const void *, size_t) */
  ENTRY(__memcpy)
  WEAK(memcpy)
-       move t6, a0  /* Preserve return value */
+       /* Save for return value */
+       mv      t6, a0
  
-       /* Defer to byte-oriented copy for small sizes */
-       sltiu a3, a2, 128
-       bnez a3, 4f
-       /* Use word-oriented copy only if low-order bits match */
-       andi a3, t6, SZREG-1
-       andi a4, a1, SZREG-1
-       bne a3, a4, 4f
+       /*
+        * Register allocation for code below:
+        * a0 - start of uncopied dst
+        * a1 - start of uncopied src
+        * t0 - end of uncopied dst
+        */
+       add     t0, a0, a2
  
-       beqz a3, 2f  /* Skip if already aligned */
         /*
-        * Round to nearest double word-aligned address
-        * greater than or equal to start address
+        * Use bytewise copy if too small.
+        *
+        * This threshold must be at least 2*SZREG to ensure at least one
+        * wordwise copy is performed. It is chosen to be 16 because it will
+        * save at least 7 iterations of bytewise copy, which pays off the
+        * fixed overhead.
          */
-       andi a3, a1, ~(SZREG-1)
-       addi a3, a3, SZREG
-       /* Handle initial misalignment */
-       sub a4, a3, a1
+       li      a3, 16
+       bltu    a2, a3, .Lbyte_copy_tail
+
+       /*
+        * Bytewise copy first to align a0 to word boundary.
+        */
+       addi    a2, a0, SZREG-1
+       andi    a2, a2, ~(SZREG-1)
+       beq     a0, a2, 2f
  1:
-       lb a5, 0(a1)
-       addi a1, a1, 1
-       sb a5, 0(t6)
-       addi t6, t6, 1
-       bltu a1, a3, 1b
-       sub a2, a2, a4  /* Update count */
+       lb      a5, 0(a1)
+       addi    a1, a1, 1
+       sb      a5, 0(a0)
+       addi    a0, a0, 1
+       bne     a0, a2, 1b
+2:
+
+       /*
+        * Now a0 is word-aligned. If a1 is also word aligned, we could perform
+        * aligned word-wise copy. Otherwise we need to perform misaligned
+        * word-wise copy.
+        */
+       andi    a3, a1, SZREG-1
+       bnez    a3, .Lmisaligned_word_copy
  
+       /* Unrolled wordwise copy */
+       addi    t0, t0, -(16*SZREG-1)
+       bgeu    a0, t0, 2f
+1:
+       REG_L   a2,        0(a1)
+       REG_L   a3,    SZREG(a1)
+       REG_L   a4,  2*SZREG(a1)
+       REG_L   a5,  3*SZREG(a1)
+       REG_L   a6,  4*SZREG(a1)
+       REG_L   a7,  5*SZREG(a1)
+       REG_L   t1,  6*SZREG(a1)
+       REG_L   t2,  7*SZREG(a1)
+       REG_L   t3,  8*SZREG(a1)
+       REG_L   t4,  9*SZREG(a1)
+       REG_L   t5, 10*SZREG(a1)
+       REG_S   a2,        0(a0)
+       REG_S   a3,    SZREG(a0)
+       REG_S   a4,  2*SZREG(a0)
+       REG_S   a5,  3*SZREG(a0)
+       REG_S   a6,  4*SZREG(a0)
+       REG_S   a7,  5*SZREG(a0)
+       REG_S   t1,  6*SZREG(a0)
+       REG_S   t2,  7*SZREG(a0)
+       REG_S   t3,  8*SZREG(a0)
+       REG_S   t4,  9*SZREG(a0)
+       REG_S   t5, 10*SZREG(a0)
+       REG_L   a2, 11*SZREG(a1)
+       REG_L   a3, 12*SZREG(a1)
+       REG_L   a4, 13*SZREG(a1)
+       REG_L   a5, 14*SZREG(a1)
+       REG_L   a6, 15*SZREG(a1)
+       addi    a1, a1, 16*SZREG
+       REG_S   a2, 11*SZREG(a0)
+       REG_S   a3, 12*SZREG(a0)
+       REG_S   a4, 13*SZREG(a0)
+       REG_S   a5, 14*SZREG(a0)
+       REG_S   a6, 15*SZREG(a0)
+       addi    a0, a0, 16*SZREG
+       bltu    a0, t0, 1b
  2:
-       andi a4, a2, ~((16*SZREG)-1)
-       beqz a4, 4f
-       add a3, a1, a4
-3:
-       REG_L a4,       0(a1)
-       REG_L a5,   SZREG(a1)
-       REG_L a6, 2*SZREG(a1)
-       REG_L a7, 3*SZREG(a1)
-       REG_L t0, 4*SZREG(a1)
-       REG_L t1, 5*SZREG(a1)
-       REG_L t2, 6*SZREG(a1)
-       REG_L t3, 7*SZREG(a1)
-       REG_L t4, 8*SZREG(a1)
-       REG_L t5, 9*SZREG(a1)
-       REG_S a4,       0(t6)
-       REG_S a5,   SZREG(t6)
-       REG_S a6, 2*SZREG(t6)
-       REG_S a7, 3*SZREG(t6)
-       REG_S t0, 4*SZREG(t6)
-       REG_S t1, 5*SZREG(t6)
-       REG_S t2, 6*SZREG(t6)
-       REG_S t3, 7*SZREG(t6)
-       REG_S t4, 8*SZREG(t6)
-       REG_S t5, 9*SZREG(t6)
-       REG_L a4, 10*SZREG(a1)
-       REG_L a5, 11*SZREG(a1)
-       REG_L a6, 12*SZREG(a1)
-       REG_L a7, 13*SZREG(a1)
-       REG_L t0, 14*SZREG(a1)
-       REG_L t1, 15*SZREG(a1)
-       addi a1, a1, 16*SZREG
-       REG_S a4, 10*SZREG(t6)
-       REG_S a5, 11*SZREG(t6)
-       REG_S a6, 12*SZREG(t6)
-       REG_S a7, 13*SZREG(t6)
-       REG_S t0, 14*SZREG(t6)
-       REG_S t1, 15*SZREG(t6)
-       addi t6, t6, 16*SZREG
-       bltu a1, a3, 3b
-       andi a2, a2, (16*SZREG)-1  /* Update count */
-
-4:
-       /* Handle trailing misalignment */
-       beqz a2, 6f
-       add a3, a1, a2
-
-       /* Use word-oriented copy if co-aligned to word boundary */
-       or a5, a1, t6
-       or a5, a5, a3
-       andi a5, a5, 3
-       bnez a5, 5f
-7:
-       lw a4, 0(a1)
-       addi a1, a1, 4
-       sw a4, 0(t6)
-       addi t6, t6, 4
-       bltu a1, a3, 7b
+       /* Post-loop increment by 16*SZREG-1 and pre-loop decrement by SZREG-1 */
+       addi    t0, t0, 15*SZREG
  
-       ret
+       /* Wordwise copy */
+       bgeu    a0, t0, 2f
+1:
+       REG_L   a5, 0(a1)
+       addi    a1, a1, SZREG
+       REG_S   a5, 0(a0)
+       addi    a0, a0, SZREG
+       bltu    a0, t0, 1b
+2:
+       addi    t0, t0, SZREG-1
  
-5:
-       lb a4, 0(a1)
-       addi a1, a1, 1
-       sb a4, 0(t6)
-       addi t6, t6, 1
-       bltu a1, a3, 5b
-6:
+.Lbyte_copy_tail:
+       /*
+        * Bytewise copy anything left.
+        */
+       beq     a0, t0, 2f
+1:
+       lb      a5, 0(a1)
+       addi    a1, a1, 1
+       sb      a5, 0(a0)
+       addi    a0, a0, 1
+       bne     a0, t0, 1b
+2:
+
+       mv      a0, t6
         ret
+
+.Lmisaligned_word_copy:
+       /*
+        * Misaligned word-wise copy.
+        * For misaligned copy we still perform word-wise copy, but we need to
+        * use the value fetched from the previous iteration and do some shifts.
+        * This is safe because we wouldn't access more words than necessary.
+        */
+
+       /* Calculate shifts */
+       slli    t3, a3, 3
+       sub     t4, x0, t3 /* negate is okay as shift will only look at LSBs */
+
+       /* Load the initial value and align a1 */
+       andi    a1, a1, ~(SZREG-1)
+       REG_L   a5, 0(a1)
+
+       addi    t0, t0, -(SZREG-1)
+       /* At least one iteration will be executed here, no check */
+1:
+       srl     a4, a5, t3
+       REG_L   a5, SZREG(a1)
+       addi    a1, a1, SZREG
+       sll     a2, a5, t4
+       or      a2, a2, a4
+       REG_S   a2, 0(a0)
+       addi    a0, a0, SZREG
+       bltu    a0, t0, 1b
+
+       /* Update pointers to correct value */
+       addi    t0, t0, SZREG-1
+       add     a1, a1, a3
+
+       j       .Lbyte_copy_tail
  END(__memcpy)
diff --git a/arch/riscv/lib/memmove.S b/arch/riscv/lib/memmove.S

index 07d1d2152ba5ce70f73029b1b28872b5f54f4483..fbe6701dbe4a435aa808b507958c146b845f8d36 100644 (file)
--- a/arch/riscv/lib/memmove.S
+++ b/arch/riscv/lib/memmove.S
@@ -5,60 +5,124 @@
  
  ENTRY(__memmove)
  WEAK(memmove)
-        move    t0, a0
-        move    t1, a1
-
-        beq     a0, a1, exit_memcpy
-        beqz    a2, exit_memcpy
-        srli    t2, a2, 0x2
-
-        slt     t3, a0, a1
-        beqz    t3, do_reverse
-
-        andi    a2, a2, 0x3
-        li      t4, 1
-        beqz    t2, byte_copy
-
-word_copy:
-        lw      t3, 0(a1)
-        addi    t2, t2, -1
-        addi    a1, a1, 4
-        sw      t3, 0(a0)
-        addi    a0, a0, 4
-        bnez    t2, word_copy
-        beqz    a2, exit_memcpy
-        j       byte_copy
-
-do_reverse:
-        add     a0, a0, a2
-        add     a1, a1, a2
-        andi    a2, a2, 0x3
-        li      t4, -1
-        beqz    t2, reverse_byte_copy
-
-reverse_word_copy:
-        addi    a1, a1, -4
-        addi    t2, t2, -1
-        lw      t3, 0(a1)
-        addi    a0, a0, -4
-        sw      t3, 0(a0)
-        bnez    t2, reverse_word_copy
-        beqz    a2, exit_memcpy
-
-reverse_byte_copy:
-        addi    a0, a0, -1
-        addi    a1, a1, -1
-
-byte_copy:
-        lb      t3, 0(a1)
-        addi    a2, a2, -1
-        sb      t3, 0(a0)
-        add     a1, a1, t4
-        add     a0, a0, t4
-        bnez    a2, byte_copy
-
-exit_memcpy:
-        move a0, t0
-        move a1, t1
-        ret
+       /*
+        * Here we determine if forward copy is possible. Forward copy is
+        * preferred to backward copy as it is more cache friendly.
+        *
+        * If a0 >= a1, t0 gives their distance, if t0 >= a2 then we can
+        *   copy forward.
+        * If a0 < a1, we can always copy forward. This will make t0 negative,
+        *   so a *unsigned* comparison will always have t0 >= a2.
+        *
+        * For forward copy we just delegate the task to memcpy.
+        */
+       sub     t0, a0, a1
+       bltu    t0, a2, 1f
+       tail    __memcpy
+1:
+
+       /*
+        * Register allocation for code below:
+        * a0 - end of uncopied dst
+        * a1 - end of uncopied src
+        * t0 - start of uncopied dst
+        */
+       mv      t0, a0
+       add     a0, a0, a2
+       add     a1, a1, a2
+
+       /*
+        * Use bytewise copy if too small.
+        *
+        * This threshold must be at least 2*SZREG to ensure at least one
+        * wordwise copy is performed. It is chosen to be 16 because it will
+        * save at least 7 iterations of bytewise copy, which pays off the
+        * fixed overhead.
+        */
+       li      a3, 16
+       bltu    a2, a3, .Lbyte_copy_tail
+
+       /*
+        * Bytewise copy first to align t0 to word boundary.
+        */
+       andi    a2, a0, ~(SZREG-1)
+       beq     a0, a2, 2f
+1:
+       addi    a1, a1, -1
+       lb      a5, 0(a1)
+       addi    a0, a0, -1
+       sb      a5, 0(a0)
+       bne     a0, a2, 1b
+2:
+
+       /*
+        * Now a0 is word-aligned. If a1 is also word aligned, we could perform
+        * aligned word-wise copy. Otherwise we need to perform misaligned
+        * word-wise copy.
+        */
+       andi    a3, a1, SZREG-1
+       bnez    a3, .Lmisaligned_word_copy
+
+       /* Wordwise copy */
+       addi    t0, t0, SZREG-1
+       bleu    a0, t0, 2f
+1:
+       addi    a1, a1, -SZREG
+       REG_L   a5, 0(a1)
+       addi    a0, a0, -SZREG
+       REG_S   a5, 0(a0)
+       bgtu    a0, t0, 1b
+2:
+       addi    t0, t0, -(SZREG-1)
+
+.Lbyte_copy_tail:
+       /*
+        * Bytewise copy anything left.
+        */
+       beq     a0, t0, 2f
+1:
+       addi    a1, a1, -1
+       lb      a5, 0(a1)
+       addi    a0, a0, -1
+       sb      a5, 0(a0)
+       bne     a0, t0, 1b
+2:
+
+       mv      a0, t0
+       ret
+
+.Lmisaligned_word_copy:
+       /*
+        * Misaligned word-wise copy.
+        * For misaligned copy we still perform word-wise copy, but we need to
+        * use the value fetched from the previous iteration and do some shifts.
+        * This is safe because we wouldn't access more words than necessary.
+        */
+
+       /* Calculate shifts */
+       slli    t3, a3, 3
+       sub     t4, x0, t3 /* negate is okay as shift will only look at LSBs */
+
+       /* Load the initial value and align a1 */
+       andi    a1, a1, ~(SZREG-1)
+       REG_L   a5, 0(a1)
+
+       addi    t0, t0, SZREG-1
+       /* At least one iteration will be executed here, no check */
+1:
+       sll     a4, a5, t4
+       addi    a1, a1, -SZREG
+       REG_L   a5, 0(a1)
+       srl     a2, a5, t3
+       or      a2, a2, a4
+       addi    a0, a0, -SZREG
+       REG_S   a2, 0(a0)
+       bgtu    a0, t0, 1b
+
+       /* Update pointers to correct value */
+       addi    t0, t0, -(SZREG-1)
+       add     a1, a1, a3
+
+       j       .Lbyte_copy_tail
+
  END(__memmove)
author	Bin Meng <bmeng.cn@gmail.com>
	Thu, 13 May 2021 08:46:17 +0000 (16:46 +0800)
committer	Leo Yu-Chi Liang <ycliang@andestech.com>
	Mon, 17 May 2021 08:47:33 +0000 (16:47 +0800)
arch/riscv/lib/memcpy.S		patch \| blob \| history
arch/riscv/lib/memmove.S		patch \| blob \| history