From: Matteo Croce Date: Wed, 30 Mar 2022 12:07:14 +0000 (+0800) Subject: csky: Add C based string functions X-Git-Tag: v6.1-rc5~1326^2~7 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=e4df2d5e852a7d24df3672ae9951eb79e179be08;p=platform%2Fkernel%2Flinux-starfive.git csky: Add C based string functions Try to access RAM with the largest bit width possible, but without doing unaligned accesses. A further improvement could be to use multiple read and writes as the assembly version was trying to do. Tested on a BeagleV Starlight with a SiFive U74 core, where the improvement is noticeable. Signed-off-by: Matteo Croce Co-developed-by: Guo Ren Signed-off-by: Guo Ren --- diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 75ef866..21d72b0 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -320,6 +320,14 @@ config HOTPLUG_CPU controlled through /sys/devices/system/cpu/cpu1/hotplug/target. Say N if you want to disable CPU hotplug. + +config HAVE_EFFICIENT_UNALIGNED_STRING_OPS + bool "Enable EFFICIENT_UNALIGNED_STRING_OPS for abiv2" + depends on CPU_CK807 || CPU_CK810 || CPU_CK860 + help + Say Y here to enable EFFICIENT_UNALIGNED_STRING_OPS. Some CPU models could + deal with unaligned access by hardware. + endmenu source "arch/csky/Kconfig.platforms" diff --git a/arch/csky/abiv1/Makefile b/arch/csky/abiv1/Makefile index 601ce3b..a4b2ade 100644 --- a/arch/csky/abiv1/Makefile +++ b/arch/csky/abiv1/Makefile @@ -4,5 +4,3 @@ obj-y += bswapdi.o obj-y += bswapsi.o obj-y += cacheflush.o obj-y += mmap.o -obj-y += memcpy.o -obj-y += strksyms.o diff --git a/arch/csky/abiv1/memcpy.S b/arch/csky/abiv1/memcpy.S deleted file mode 100644 index 5078eb5..0000000 --- a/arch/csky/abiv1/memcpy.S +++ /dev/null @@ -1,347 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. - -#include - -.macro GET_FRONT_BITS rx y -#ifdef __cskyLE__ - lsri \rx, \y -#else - lsli \rx, \y -#endif -.endm - -.macro GET_AFTER_BITS rx y -#ifdef __cskyLE__ - lsli \rx, \y -#else - lsri \rx, \y -#endif -.endm - -/* void *memcpy(void *dest, const void *src, size_t n); */ -ENTRY(memcpy) - mov r7, r2 - cmplti r4, 4 - bt .L_copy_by_byte - mov r6, r2 - andi r6, 3 - cmpnei r6, 0 - jbt .L_dest_not_aligned - mov r6, r3 - andi r6, 3 - cmpnei r6, 0 - jbt .L_dest_aligned_but_src_not_aligned -.L0: - cmplti r4, 16 - jbt .L_aligned_and_len_less_16bytes - subi sp, 8 - stw r8, (sp, 0) -.L_aligned_and_len_larger_16bytes: - ldw r1, (r3, 0) - ldw r5, (r3, 4) - ldw r8, (r3, 8) - stw r1, (r7, 0) - ldw r1, (r3, 12) - stw r5, (r7, 4) - stw r8, (r7, 8) - stw r1, (r7, 12) - subi r4, 16 - addi r3, 16 - addi r7, 16 - cmplti r4, 16 - jbf .L_aligned_and_len_larger_16bytes - ldw r8, (sp, 0) - addi sp, 8 - cmpnei r4, 0 - jbf .L_return - -.L_aligned_and_len_less_16bytes: - cmplti r4, 4 - bt .L_copy_by_byte -.L1: - ldw r1, (r3, 0) - stw r1, (r7, 0) - subi r4, 4 - addi r3, 4 - addi r7, 4 - cmplti r4, 4 - jbf .L1 - br .L_copy_by_byte - -.L_return: - rts - -.L_copy_by_byte: /* len less than 4 bytes */ - cmpnei r4, 0 - jbf .L_return -.L4: - ldb r1, (r3, 0) - stb r1, (r7, 0) - addi r3, 1 - addi r7, 1 - decne r4 - jbt .L4 - rts - -/* - * If dest is not aligned, just copying some bytes makes the dest align. - * Afther that, we judge whether the src is aligned. - */ -.L_dest_not_aligned: - mov r5, r3 - rsub r5, r5, r7 - abs r5, r5 - cmplt r5, r4 - bt .L_copy_by_byte - mov r5, r7 - sub r5, r3 - cmphs r5, r4 - bf .L_copy_by_byte - mov r5, r6 -.L5: - ldb r1, (r3, 0) /* makes the dest align. */ - stb r1, (r7, 0) - addi r5, 1 - subi r4, 1 - addi r3, 1 - addi r7, 1 - cmpnei r5, 4 - jbt .L5 - cmplti r4, 4 - jbt .L_copy_by_byte - mov r6, r3 /* judge whether the src is aligned. */ - andi r6, 3 - cmpnei r6, 0 - jbf .L0 - -/* Judge the number of misaligned, 1, 2, 3? */ -.L_dest_aligned_but_src_not_aligned: - mov r5, r3 - rsub r5, r5, r7 - abs r5, r5 - cmplt r5, r4 - bt .L_copy_by_byte - bclri r3, 0 - bclri r3, 1 - ldw r1, (r3, 0) - addi r3, 4 - cmpnei r6, 2 - bf .L_dest_aligned_but_src_not_aligned_2bytes - cmpnei r6, 3 - bf .L_dest_aligned_but_src_not_aligned_3bytes - -.L_dest_aligned_but_src_not_aligned_1byte: - mov r5, r7 - sub r5, r3 - cmphs r5, r4 - bf .L_copy_by_byte - cmplti r4, 16 - bf .L11 -.L10: /* If the len is less than 16 bytes */ - GET_FRONT_BITS r1 8 - mov r5, r1 - ldw r6, (r3, 0) - mov r1, r6 - GET_AFTER_BITS r6 24 - or r5, r6 - stw r5, (r7, 0) - subi r4, 4 - addi r3, 4 - addi r7, 4 - cmplti r4, 4 - bf .L10 - subi r3, 3 - br .L_copy_by_byte -.L11: - subi sp, 16 - stw r8, (sp, 0) - stw r9, (sp, 4) - stw r10, (sp, 8) - stw r11, (sp, 12) -.L12: - ldw r5, (r3, 0) - ldw r11, (r3, 4) - ldw r8, (r3, 8) - ldw r9, (r3, 12) - - GET_FRONT_BITS r1 8 /* little or big endian? */ - mov r10, r5 - GET_AFTER_BITS r5 24 - or r5, r1 - - GET_FRONT_BITS r10 8 - mov r1, r11 - GET_AFTER_BITS r11 24 - or r11, r10 - - GET_FRONT_BITS r1 8 - mov r10, r8 - GET_AFTER_BITS r8 24 - or r8, r1 - - GET_FRONT_BITS r10 8 - mov r1, r9 - GET_AFTER_BITS r9 24 - or r9, r10 - - stw r5, (r7, 0) - stw r11, (r7, 4) - stw r8, (r7, 8) - stw r9, (r7, 12) - subi r4, 16 - addi r3, 16 - addi r7, 16 - cmplti r4, 16 - jbf .L12 - ldw r8, (sp, 0) - ldw r9, (sp, 4) - ldw r10, (sp, 8) - ldw r11, (sp, 12) - addi sp , 16 - cmplti r4, 4 - bf .L10 - subi r3, 3 - br .L_copy_by_byte - -.L_dest_aligned_but_src_not_aligned_2bytes: - cmplti r4, 16 - bf .L21 -.L20: - GET_FRONT_BITS r1 16 - mov r5, r1 - ldw r6, (r3, 0) - mov r1, r6 - GET_AFTER_BITS r6 16 - or r5, r6 - stw r5, (r7, 0) - subi r4, 4 - addi r3, 4 - addi r7, 4 - cmplti r4, 4 - bf .L20 - subi r3, 2 - br .L_copy_by_byte - rts - -.L21: /* n > 16 */ - subi sp, 16 - stw r8, (sp, 0) - stw r9, (sp, 4) - stw r10, (sp, 8) - stw r11, (sp, 12) - -.L22: - ldw r5, (r3, 0) - ldw r11, (r3, 4) - ldw r8, (r3, 8) - ldw r9, (r3, 12) - - GET_FRONT_BITS r1 16 - mov r10, r5 - GET_AFTER_BITS r5 16 - or r5, r1 - - GET_FRONT_BITS r10 16 - mov r1, r11 - GET_AFTER_BITS r11 16 - or r11, r10 - - GET_FRONT_BITS r1 16 - mov r10, r8 - GET_AFTER_BITS r8 16 - or r8, r1 - - GET_FRONT_BITS r10 16 - mov r1, r9 - GET_AFTER_BITS r9 16 - or r9, r10 - - stw r5, (r7, 0) - stw r11, (r7, 4) - stw r8, (r7, 8) - stw r9, (r7, 12) - subi r4, 16 - addi r3, 16 - addi r7, 16 - cmplti r4, 16 - jbf .L22 - ldw r8, (sp, 0) - ldw r9, (sp, 4) - ldw r10, (sp, 8) - ldw r11, (sp, 12) - addi sp, 16 - cmplti r4, 4 - bf .L20 - subi r3, 2 - br .L_copy_by_byte - - -.L_dest_aligned_but_src_not_aligned_3bytes: - cmplti r4, 16 - bf .L31 -.L30: - GET_FRONT_BITS r1 24 - mov r5, r1 - ldw r6, (r3, 0) - mov r1, r6 - GET_AFTER_BITS r6 8 - or r5, r6 - stw r5, (r7, 0) - subi r4, 4 - addi r3, 4 - addi r7, 4 - cmplti r4, 4 - bf .L30 - subi r3, 1 - br .L_copy_by_byte -.L31: - subi sp, 16 - stw r8, (sp, 0) - stw r9, (sp, 4) - stw r10, (sp, 8) - stw r11, (sp, 12) -.L32: - ldw r5, (r3, 0) - ldw r11, (r3, 4) - ldw r8, (r3, 8) - ldw r9, (r3, 12) - - GET_FRONT_BITS r1 24 - mov r10, r5 - GET_AFTER_BITS r5 8 - or r5, r1 - - GET_FRONT_BITS r10 24 - mov r1, r11 - GET_AFTER_BITS r11 8 - or r11, r10 - - GET_FRONT_BITS r1 24 - mov r10, r8 - GET_AFTER_BITS r8 8 - or r8, r1 - - GET_FRONT_BITS r10 24 - mov r1, r9 - GET_AFTER_BITS r9 8 - or r9, r10 - - stw r5, (r7, 0) - stw r11, (r7, 4) - stw r8, (r7, 8) - stw r9, (r7, 12) - subi r4, 16 - addi r3, 16 - addi r7, 16 - cmplti r4, 16 - jbf .L32 - ldw r8, (sp, 0) - ldw r9, (sp, 4) - ldw r10, (sp, 8) - ldw r11, (sp, 12) - addi sp, 16 - cmplti r4, 4 - bf .L30 - subi r3, 1 - br .L_copy_by_byte diff --git a/arch/csky/abiv1/strksyms.c b/arch/csky/abiv1/strksyms.c deleted file mode 100644 index c7ccbb2..0000000 --- a/arch/csky/abiv1/strksyms.c +++ /dev/null @@ -1,6 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. - -#include - -EXPORT_SYMBOL(memcpy); diff --git a/arch/csky/abiv2/Makefile b/arch/csky/abiv2/Makefile index c561efa..ea8005f 100644 --- a/arch/csky/abiv2/Makefile +++ b/arch/csky/abiv2/Makefile @@ -2,9 +2,11 @@ obj-y += cacheflush.o obj-$(CONFIG_CPU_HAS_FPU) += fpu.o obj-y += memcmp.o +ifeq ($(CONFIG_HAVE_EFFICIENT_UNALIGNED_STRING_OPS), y) obj-y += memcpy.o obj-y += memmove.o obj-y += memset.o +endif obj-y += strcmp.o obj-y += strcpy.o obj-y += strlen.o diff --git a/arch/csky/abiv2/strksyms.c b/arch/csky/abiv2/strksyms.c index 06da723..8d1fd28 100644 --- a/arch/csky/abiv2/strksyms.c +++ b/arch/csky/abiv2/strksyms.c @@ -3,10 +3,12 @@ #include +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_STRING_OPS EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memset); -EXPORT_SYMBOL(memcmp); EXPORT_SYMBOL(memmove); +#endif +EXPORT_SYMBOL(memcmp); EXPORT_SYMBOL(strcmp); EXPORT_SYMBOL(strcpy); EXPORT_SYMBOL(strlen); diff --git a/arch/csky/lib/Makefile b/arch/csky/lib/Makefile index 7fbdbb2..d0ce6e2 100644 --- a/arch/csky/lib/Makefile +++ b/arch/csky/lib/Makefile @@ -1,3 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only lib-y := usercopy.o delay.o obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o +ifneq ($(CONFIG_HAVE_EFFICIENT_UNALIGNED_STRING_OPS), y) +lib-y += string.o +endif diff --git a/arch/csky/lib/string.c b/arch/csky/lib/string.c new file mode 100644 index 0000000..d65626f --- /dev/null +++ b/arch/csky/lib/string.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * String functions optimized for hardware which doesn't + * handle unaligned memory accesses efficiently. + * + * Copyright (C) 2021 Matteo Croce + */ + +#include +#include + +/* Minimum size for a word copy to be convenient */ +#define BYTES_LONG sizeof(long) +#define WORD_MASK (BYTES_LONG - 1) +#define MIN_THRESHOLD (BYTES_LONG * 2) + +/* convenience union to avoid cast between different pointer types */ +union types { + u8 *as_u8; + unsigned long *as_ulong; + uintptr_t as_uptr; +}; + +union const_types { + const u8 *as_u8; + unsigned long *as_ulong; + uintptr_t as_uptr; +}; + +void *memcpy(void *dest, const void *src, size_t count) +{ + union const_types s = { .as_u8 = src }; + union types d = { .as_u8 = dest }; + int distance = 0; + + if (count < MIN_THRESHOLD) + goto copy_remainder; + + /* Copy a byte at time until destination is aligned. */ + for (; d.as_uptr & WORD_MASK; count--) + *d.as_u8++ = *s.as_u8++; + + distance = s.as_uptr & WORD_MASK; + + if (distance) { + unsigned long last, next; + + /* + * s is distance bytes ahead of d, and d just reached + * the alignment boundary. Move s backward to word align it + * and shift data to compensate for distance, in order to do + * word-by-word copy. + */ + s.as_u8 -= distance; + + next = s.as_ulong[0]; + for (; count >= BYTES_LONG; count -= BYTES_LONG) { + last = next; + next = s.as_ulong[1]; + + d.as_ulong[0] = last >> (distance * 8) | + next << ((BYTES_LONG - distance) * 8); + + d.as_ulong++; + s.as_ulong++; + } + + /* Restore s with the original offset. */ + s.as_u8 += distance; + } else { + /* + * If the source and dest lower bits are the same, do a simple + * 32/64 bit wide copy. + */ + for (; count >= BYTES_LONG; count -= BYTES_LONG) + *d.as_ulong++ = *s.as_ulong++; + } + +copy_remainder: + while (count--) + *d.as_u8++ = *s.as_u8++; + + return dest; +} +EXPORT_SYMBOL(memcpy); + +/* + * Simply check if the buffer overlaps an call memcpy() in case, + * otherwise do a simple one byte at time backward copy. + */ +void *memmove(void *dest, const void *src, size_t count) +{ + if (dest < src || src + count <= dest) + return memcpy(dest, src, count); + + if (dest > src) { + const char *s = src + count; + char *tmp = dest + count; + + while (count--) + *--tmp = *--s; + } + return dest; +} +EXPORT_SYMBOL(memmove); + +void *memset(void *s, int c, size_t count) +{ + union types dest = { .as_u8 = s }; + + if (count >= MIN_THRESHOLD) { + unsigned long cu = (unsigned long)c; + + /* Compose an ulong with 'c' repeated 4/8 times */ + cu |= cu << 8; + cu |= cu << 16; + /* Suppress warning on 32 bit machines */ + cu |= (cu << 16) << 16; + + for (; count && dest.as_uptr & WORD_MASK; count--) + *dest.as_u8++ = c; + + /* Copy using the largest size allowed */ + for (; count >= BYTES_LONG; count -= BYTES_LONG) + *dest.as_ulong++ = cu; + } + + /* copy the remainder */ + while (count--) + *dest.as_u8++ = c; + + return s; +} +EXPORT_SYMBOL(memset);