From 79379298f248fefa609041bd97f30274257c2a39 Mon Sep 17 00:00:00 2001
From: Matteo Croce
Date: Wed, 29 Sep 2021 19:22:33 +0200
Subject: [PATCH] riscv: optimized memmove

When the destination buffer is before the source one, or when the
buffers don't overlap, it's safe to use memcpy() instead, which is
optimized to use the largest possible access size.

Signed-off-by: Matteo Croce
---
 arch/riscv/include/asm/string.h |   6 +-
 arch/riscv/kernel/riscv_ksyms.c |   2 -
 arch/riscv/lib/Makefile         |   1 -
 arch/riscv/lib/memmove.S        | 316 ----------------------------------------
 arch/riscv/lib/string.c         |  23 +++
 5 files changed, 26 insertions(+), 322 deletions(-)
 delete mode 100644 arch/riscv/lib/memmove.S

diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h
index 23fdcbf..0212046 100644
--- a/arch/riscv/include/asm/string.h
+++ b/arch/riscv/include/asm/string.h
@@ -16,10 +16,10 @@ extern asmlinkage void *__memset(void *, int, size_t);
 #define __HAVE_ARCH_MEMCPY
 extern void *memcpy(void *dest, const void *src, size_t count);
 extern void *__memcpy(void *dest, const void *src, size_t count);
-
 #define __HAVE_ARCH_MEMMOVE
-extern asmlinkage void *memmove(void *, const void *, size_t);
-extern asmlinkage void *__memmove(void *, const void *, size_t);
+extern void *memmove(void *dest, const void *src, size_t count);
+extern void *__memmove(void *dest, const void *src, size_t count);
+
 /* For those files which don't want to check by kasan. */
 #if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
 #define memcpy(dst, src, len) __memcpy(dst, src, len)
diff --git a/arch/riscv/kernel/riscv_ksyms.c b/arch/riscv/kernel/riscv_ksyms.c
index 3f6d512..361565c 100644
--- a/arch/riscv/kernel/riscv_ksyms.c
+++ b/arch/riscv/kernel/riscv_ksyms.c
@@ -10,6 +10,4 @@
  * Assembly functions that may be used (directly or indirectly) by modules
  */
 EXPORT_SYMBOL(memset);
-EXPORT_SYMBOL(memmove);
 EXPORT_SYMBOL(__memset);
-EXPORT_SYMBOL(__memmove);
diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
index 023d78f..9166b61 100644
--- a/arch/riscv/lib/Makefile
+++ b/arch/riscv/lib/Makefile
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 lib-y                   += delay.o
 lib-y                   += memset.o
-lib-y                   += memmove.o
 lib-$(CONFIG_MMU)       += uaccess.o
 lib-$(CONFIG_64BIT)     += tishift.o
 lib-y                   += string.o
diff --git a/arch/riscv/lib/memmove.S b/arch/riscv/lib/memmove.S
deleted file mode 100644
index e0609e1..0000000
--- a/arch/riscv/lib/memmove.S
+++ /dev/null
@@ -1,316 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2022 Michael T. Kloos
- */
-
-#include <asm/asm.h>
-#include <linux/linkage.h>
-
-SYM_FUNC_START(__memmove)
-SYM_FUNC_START_WEAK(memmove)
-        /*
-         * Returns
-         *   a0 - dest
-         *
-         * Parameters
-         *   a0 - Inclusive first byte of dest
-         *   a1 - Inclusive first byte of src
-         *   a2 - Length of copy n
-         *
-         * Because the return matches the parameter register a0,
-         * we will not clobber or modify that register.
-         *
-         * Note: This currently only works on little-endian.
-         * To port to big-endian, reverse the direction of shifts
-         * in the 2 misaligned fixup copy loops.
-         */
-
-        /* Return if nothing to do */
-        beq a0, a1, return_from_memmove
-        beqz a2, return_from_memmove
-
-        /*
-         * Register Uses
-         *   Forward Copy: a1 - Index counter of src
-         *   Reverse Copy: a4 - Index counter of src
-         *   Forward Copy: t3 - Index counter of dest
-         *   Reverse Copy: t4 - Index counter of dest
-         *   Both Copy Modes: t5 - Inclusive first multibyte/aligned of dest
-         *   Both Copy Modes: t6 - Non-Inclusive last multibyte/aligned of dest
-         *   Both Copy Modes: t0 - Link / Temporary for load-store
-         *   Both Copy Modes: t1 - Temporary for load-store
-         *   Both Copy Modes: t2 - Temporary for load-store
-         *   Both Copy Modes: a5 - dest to src alignment offset
-         *   Both Copy Modes: a6 - Shift ammount
-         *   Both Copy Modes: a7 - Inverse Shift ammount
-         *   Both Copy Modes: a2 - Alternate breakpoint for unrolled loops
-         */
-
-        /*
-         * Solve for some register values now.
-         * Byte copy does not need t5 or t6.
-         */
-        mv t3, a0
-        add t4, a0, a2
-        add a4, a1, a2
-
-        /*
-         * Byte copy if copying less than (2 * SZREG) bytes. This can
-         * cause problems with the bulk copy implementation and is
-         * small enough not to bother.
-         */
-        andi t0, a2, -(2 * SZREG)
-        beqz t0, byte_copy
-
-        /*
-         * Now solve for t5 and t6.
-         */
-        andi t5, t3, -SZREG
-        andi t6, t4, -SZREG
-        /*
-         * If dest(Register t3) rounded down to the nearest naturally
-         * aligned SZREG address, does not equal dest, then add SZREG
-         * to find the low-bound of SZREG alignment in the dest memory
-         * region. Note that this could overshoot the dest memory
-         * region if n is less than SZREG. This is one reason why
-         * we always byte copy if n is less than SZREG.
-         * Otherwise, dest is already naturally aligned to SZREG.
-         */
-        beq t5, t3, 1f
-        addi t5, t5, SZREG
-        1:
-
-        /*
-         * If the dest and src are co-aligned to SZREG, then there is
-         * no need for the full rigmarole of a full misaligned fixup copy.
-         * Instead, do a simpler co-aligned copy.
-         */
-        xor t0, a0, a1
-        andi t1, t0, (SZREG - 1)
-        beqz t1, coaligned_copy
-        /* Fall through to misaligned fixup copy */
-
-misaligned_fixup_copy:
-        bltu a1, a0, misaligned_fixup_copy_reverse
-
-misaligned_fixup_copy_forward:
-        jal t0, byte_copy_until_aligned_forward
-
-        andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */
-        slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
-        sub a5, a1, t3 /* Find the difference between src and dest */
-        andi a1, a1, -SZREG /* Align the src pointer */
-        addi a2, t6, SZREG /* The other breakpoint for the unrolled loop*/
-
-        /*
-         * Compute The Inverse Shift
-         * a7 = XLEN - a6 = XLEN + -a6
-         * 2s complement negation to find the negative: -a6 = ~a6 + 1
-         * Add that to XLEN. XLEN = SZREG * 8.
-         */
-        not a7, a6
-        addi a7, a7, (SZREG * 8 + 1)
-
-        /*
-         * Fix Misalignment Copy Loop - Forward
-         * load_val0 = load_ptr[0];
-         * do {
-         *     load_val1 = load_ptr[1];
-         *     store_ptr += 2;
-         *     store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7});
-         *
-         *     if (store_ptr == {a2})
-         *         break;
-         *
-         *     load_val0 = load_ptr[2];
-         *     load_ptr += 2;
-         *     store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7});
-         *
-         * } while (store_ptr != store_ptr_end);
-         * store_ptr = store_ptr_end;
-         */
-
-        REG_L t0, (0 * SZREG)(a1)
-        1:
-        REG_L t1, (1 * SZREG)(a1)
-        addi t3, t3, (2 * SZREG)
-        srl t0, t0, a6
-        sll t2, t1, a7
-        or t2, t0, t2
-        REG_S t2, ((0 * SZREG) - (2 * SZREG))(t3)
-
-        beq t3, a2, 2f
-
-        REG_L t0, (2 * SZREG)(a1)
-        addi a1, a1, (2 * SZREG)
-        srl t1, t1, a6
-        sll t2, t0, a7
-        or t2, t1, t2
-        REG_S t2, ((1 * SZREG) - (2 * SZREG))(t3)
-
-        bne t3, t6, 1b
-        2:
-        mv t3, t6 /* Fix the dest pointer in case the loop was broken */
-
-        add a1, t3, a5 /* Restore the src pointer */
-        j byte_copy_forward /* Copy any remaining bytes */
-
-misaligned_fixup_copy_reverse:
-        jal t0, byte_copy_until_aligned_reverse
-
-        andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */
-        slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
-        sub a5, a4, t4 /* Find the difference between src and dest */
-        andi a4, a4, -SZREG /* Align the src pointer */
-        addi a2, t5, -SZREG /* The other breakpoint for the unrolled loop*/
-
-        /*
-         * Compute The Inverse Shift
-         * a7 = XLEN - a6 = XLEN + -a6
-         * 2s complement negation to find the negative: -a6 = ~a6 + 1
-         * Add that to XLEN. XLEN = SZREG * 8.
-         */
-        not a7, a6
-        addi a7, a7, (SZREG * 8 + 1)
-
-        /*
-         * Fix Misalignment Copy Loop - Reverse
-         * load_val1 = load_ptr[0];
-         * do {
-         *     load_val0 = load_ptr[-1];
-         *     store_ptr -= 2;
-         *     store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7});
-         *
-         *     if (store_ptr == {a2})
-         *         break;
-         *
-         *     load_val1 = load_ptr[-2];
-         *     load_ptr -= 2;
-         *     store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7});
-         *
-         * } while (store_ptr != store_ptr_end);
-         * store_ptr = store_ptr_end;
-         */
-
-        REG_L t1, ( 0 * SZREG)(a4)
-        1:
-        REG_L t0, (-1 * SZREG)(a4)
-        addi t4, t4, (-2 * SZREG)
-        sll t1, t1, a7
-        srl t2, t0, a6
-        or t2, t1, t2
-        REG_S t2, ( 1 * SZREG)(t4)
-
-        beq t4, a2, 2f
-
-        REG_L t1, (-2 * SZREG)(a4)
-        addi a4, a4, (-2 * SZREG)
-        sll t0, t0, a7
-        srl t2, t1, a6
-        or t2, t0, t2
-        REG_S t2, ( 0 * SZREG)(t4)
-
-        bne t4, t5, 1b
-        2:
-        mv t4, t5 /* Fix the dest pointer in case the loop was broken */
-
-        add a4, t4, a5 /* Restore the src pointer */
-        j byte_copy_reverse /* Copy any remaining bytes */
-
-/*
- * Simple copy loops for SZREG co-aligned memory locations.
- * These also make calls to do byte copies for any unaligned
- * data at their terminations.
- */
-coaligned_copy:
-        bltu a1, a0, coaligned_copy_reverse
-
-coaligned_copy_forward:
-        jal t0, byte_copy_until_aligned_forward
-
-        1:
-        REG_L t1, ( 0 * SZREG)(a1)
-        addi a1, a1, SZREG
-        addi t3, t3, SZREG
-        REG_S t1, (-1 * SZREG)(t3)
-        bne t3, t6, 1b
-
-        j byte_copy_forward /* Copy any remaining bytes */
-
-coaligned_copy_reverse:
-        jal t0, byte_copy_until_aligned_reverse
-
-        1:
-        REG_L t1, (-1 * SZREG)(a4)
-        addi a4, a4, -SZREG
-        addi t4, t4, -SZREG
-        REG_S t1, ( 0 * SZREG)(t4)
-        bne t4, t5, 1b
-
-        j byte_copy_reverse /* Copy any remaining bytes */
-
-/*
- * These are basically sub-functions within the function. They
- * are used to byte copy until the dest pointer is in alignment.
- * At which point, a bulk copy method can be used by the
- * calling code. These work on the same registers as the bulk
- * copy loops. Therefore, the register values can be picked
- * up from where they were left and we avoid code duplication
- * without any overhead except the call in and return jumps.
- */
-byte_copy_until_aligned_forward:
-        beq t3, t5, 2f
-        1:
-        lb t1, 0(a1)
-        addi a1, a1, 1
-        addi t3, t3, 1
-        sb t1, -1(t3)
-        bne t3, t5, 1b
-        2:
-        jalr zero, 0x0(t0) /* Return to multibyte copy loop */
-
-byte_copy_until_aligned_reverse:
-        beq t4, t6, 2f
-        1:
-        lb t1, -1(a4)
-        addi a4, a4, -1
-        addi t4, t4, -1
-        sb t1, 0(t4)
-        bne t4, t6, 1b
-        2:
-        jalr zero, 0x0(t0) /* Return to multibyte copy loop */
-
-/*
- * Simple byte copy loops.
- * These will byte copy until they reach the end of data to copy.
- * At that point, they will call to return from memmove.
- */
-byte_copy:
-        bltu a1, a0, byte_copy_reverse
-
-byte_copy_forward:
-        beq t3, t4, 2f
-        1:
-        lb t1, 0(a1)
-        addi a1, a1, 1
-        addi t3, t3, 1
-        sb t1, -1(t3)
-        bne t3, t4, 1b
-        2:
-        ret
-
-byte_copy_reverse:
-        beq t4, t3, 2f
-        1:
-        lb t1, -1(a4)
-        addi a4, a4, -1
-        addi t4, t4, -1
-        sb t1, 0(t4)
-        bne t4, t3, 1b
-        2:
-
-return_from_memmove:
-        ret
-
-SYM_FUNC_END(memmove)
-SYM_FUNC_END(__memmove)
diff --git a/arch/riscv/lib/string.c b/arch/riscv/lib/string.c
index 32509ab..bd52581 100644
--- a/arch/riscv/lib/string.c
+++ b/arch/riscv/lib/string.c
@@ -89,3 +89,26 @@ EXPORT_SYMBOL(__memcpy);
 
 void *memcpy(void *dest, const void *src, size_t count) __weak __alias(__memcpy);
 EXPORT_SYMBOL(memcpy);
+
+/*
+ * Call memcpy() when it is safe to do so (dest before src, or buffers
+ * not overlapping), otherwise do a simple one-byte-at-a-time backward copy.
+ */
+void *__memmove(void *dest, const void *src, size_t count)
+{
+        if (dest < src || src + count <= dest)
+                return __memcpy(dest, src, count);
+
+        if (dest > src) {
+                const char *s = src + count;
+                char *tmp = dest + count;
+
+                while (count--)
+                        *--tmp = *--s;
+        }
+        return dest;
+}
+EXPORT_SYMBOL(__memmove);
+
+void *memmove(void *dest, const void *src, size_t count) __weak __alias(__memmove);
+EXPORT_SYMBOL(memmove);
-- 
2.7.4
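
[Editor's note, not part of the patch] A quick way to sanity-check the overlap logic above is to mirror it in userspace. In the sketch below, test_memmove() and the buffer contents are made up for illustration, and libc memcpy() from <string.h> stands in for the kernel's __memcpy(). The kernel routine is known to copy forward, which is why the patch can call it whenever dest is below src; libc memcpy() gives no such guarantee, so this sketch only calls it for fully disjoint buffers and does its own forward byte loop otherwise.

/* Userspace sketch of the overlap check; test_memmove() is hypothetical. */
#include <assert.h>
#include <stdio.h>
#include <string.h>

static void *test_memmove(void *dest, const void *src, size_t count)
{
        const char *s = src;
        char *d = dest;

        /* Disjoint buffers: any memcpy() implementation is safe. */
        if (s + count <= d || d + count <= s)
                return memcpy(dest, src, count);

        if (d < s) {
                /* dest below src: a forward copy never overwrites unread source bytes. */
                while (count--)
                        *d++ = *s++;
        } else if (d > s) {
                /* dest overlaps the tail of src: copy backwards, as the patch does. */
                s += count;
                d += count;
                while (count--)
                        *--d = *--s;
        }
        return dest;
}

int main(void)
{
        char buf[16] = "abcdefgh";

        /* Move toward higher addresses: overlapping, must be copied backwards. */
        test_memmove(buf + 2, buf, 8);
        assert(memcmp(buf + 2, "abcdefgh", 8) == 0);

        /* Move toward lower addresses: overlapping, a forward copy is enough. */
        test_memmove(buf, buf + 2, 8);
        assert(memcmp(buf, "abcdefgh", 8) == 0);

        puts("memmove overlap checks ok");
        return 0;
}

With the patch applied, the same two calls made through the kernel's memmove() would take the byte-by-byte reverse path and the __memcpy() fast path, respectively.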