csky: Add C based string functions
author    Matteo Croce <mcroce@microsoft.com>
          Wed, 30 Mar 2022 12:07:14 +0000 (20:07 +0800)
committer Guo Ren <guoren@linux.alibaba.com>
          Mon, 18 Apr 2022 13:23:55 +0000 (21:23 +0800)
Try to access RAM with the largest bit width possible, but without
doing unaligned accesses.

A further improvement could be to use multiple reads and writes per
iteration, as the assembly version did (see the sketch below).
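
For illustration only, such an unrolled inner loop for the aligned case
might look like the following sketch (reusing the s/d unions and
BYTES_LONG from the new string.c; not part of this patch):

	for (; count >= 4 * BYTES_LONG; count -= 4 * BYTES_LONG) {
		/* issue all the loads before the stores, as the asm did */
		unsigned long w0 = s.as_ulong[0];
		unsigned long w1 = s.as_ulong[1];
		unsigned long w2 = s.as_ulong[2];
		unsigned long w3 = s.as_ulong[3];

		d.as_ulong[0] = w0;
		d.as_ulong[1] = w1;
		d.as_ulong[2] = w2;
		d.as_ulong[3] = w3;
		s.as_ulong += 4;
		d.as_ulong += 4;
	}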

Tested on a BeagleV Starlight with a SiFive U74 core, where the
improvement is noticeable.

Signed-off-by: Matteo Croce <mcroce@microsoft.com>
Co-developed-by: Guo Ren <guoren@linux.alibaba.com>
Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
arch/csky/Kconfig
arch/csky/abiv1/Makefile
arch/csky/abiv1/memcpy.S [deleted file]
arch/csky/abiv1/strksyms.c [deleted file]
arch/csky/abiv2/Makefile
arch/csky/abiv2/strksyms.c
arch/csky/lib/Makefile
arch/csky/lib/string.c [new file with mode: 0644]

diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index 75ef866..21d72b0 100644
@@ -320,6 +320,14 @@ config HOTPLUG_CPU
          controlled through /sys/devices/system/cpu/cpu1/hotplug/target.
 
          Say N if you want to disable CPU hotplug.
+
+config HAVE_EFFICIENT_UNALIGNED_STRING_OPS
+       bool "Enable EFFICIENT_UNALIGNED_STRING_OPS for abiv2"
+       depends on CPU_CK807 || CPU_CK810 || CPU_CK860
+       help
+         Say Y here to enable EFFICIENT_UNALIGNED_STRING_OPS. Some CPU
+         models can handle unaligned memory accesses in hardware.
+
 endmenu
 
 source "arch/csky/Kconfig.platforms"
diff --git a/arch/csky/abiv1/Makefile b/arch/csky/abiv1/Makefile
index 601ce3b..a4b2ade 100644
@@ -4,5 +4,3 @@ obj-y                                   += bswapdi.o
 obj-y                                  += bswapsi.o
 obj-y                                  += cacheflush.o
 obj-y                                  += mmap.o
-obj-y                                  += memcpy.o
-obj-y                                  += strksyms.o
diff --git a/arch/csky/abiv1/memcpy.S b/arch/csky/abiv1/memcpy.S
deleted file mode 100644
index 5078eb5..0000000
--- a/arch/csky/abiv1/memcpy.S
+++ /dev/null
@@ -1,347 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
-
-#include <linux/linkage.h>
-
-.macro GET_FRONT_BITS rx y
-#ifdef __cskyLE__
-       lsri    \rx, \y
-#else
-       lsli    \rx, \y
-#endif
-.endm
-
-.macro GET_AFTER_BITS rx y
-#ifdef __cskyLE__
-       lsli    \rx, \y
-#else
-       lsri    \rx, \y
-#endif
-.endm
-
-/* void *memcpy(void *dest, const void *src, size_t n); */
-ENTRY(memcpy)
-       mov     r7, r2
-       cmplti  r4, 4
-       bt      .L_copy_by_byte
-       mov     r6, r2
-       andi    r6, 3
-       cmpnei  r6, 0
-       jbt     .L_dest_not_aligned
-       mov     r6, r3
-       andi    r6, 3
-       cmpnei  r6, 0
-       jbt     .L_dest_aligned_but_src_not_aligned
-.L0:
-       cmplti  r4, 16
-       jbt     .L_aligned_and_len_less_16bytes
-       subi    sp, 8
-       stw     r8, (sp, 0)
-.L_aligned_and_len_larger_16bytes:
-       ldw     r1, (r3, 0)
-       ldw     r5, (r3, 4)
-       ldw     r8, (r3, 8)
-       stw     r1, (r7, 0)
-       ldw     r1, (r3, 12)
-       stw     r5, (r7, 4)
-       stw     r8, (r7, 8)
-       stw     r1, (r7, 12)
-       subi    r4, 16
-       addi    r3, 16
-       addi    r7, 16
-       cmplti  r4, 16
-       jbf     .L_aligned_and_len_larger_16bytes
-       ldw     r8, (sp, 0)
-       addi    sp, 8
-       cmpnei  r4, 0
-       jbf     .L_return
-
-.L_aligned_and_len_less_16bytes:
-       cmplti  r4, 4
-       bt      .L_copy_by_byte
-.L1:
-       ldw     r1, (r3, 0)
-       stw     r1, (r7, 0)
-       subi    r4, 4
-       addi    r3, 4
-       addi    r7, 4
-       cmplti  r4, 4
-       jbf     .L1
-       br      .L_copy_by_byte
-
-.L_return:
-       rts
-
-.L_copy_by_byte:                      /* len less than 4 bytes */
-       cmpnei  r4, 0
-       jbf     .L_return
-.L4:
-       ldb     r1, (r3, 0)
-       stb     r1, (r7, 0)
-       addi    r3, 1
-       addi    r7, 1
-       decne   r4
-       jbt     .L4
-       rts
-
-/*
- * If dest is not aligned, just copying some bytes makes the dest align.
- * Afther that, we judge whether the src is aligned.
- */
-.L_dest_not_aligned:
-       mov     r5, r3
-       rsub    r5, r5, r7
-       abs     r5, r5
-       cmplt   r5, r4
-       bt      .L_copy_by_byte
-       mov     r5, r7
-       sub     r5, r3
-       cmphs   r5, r4
-       bf      .L_copy_by_byte
-       mov     r5, r6
-.L5:
-       ldb     r1, (r3, 0)              /* makes the dest align. */
-       stb     r1, (r7, 0)
-       addi    r5, 1
-       subi    r4, 1
-       addi    r3, 1
-       addi    r7, 1
-       cmpnei  r5, 4
-       jbt     .L5
-       cmplti  r4, 4
-       jbt     .L_copy_by_byte
-       mov     r6, r3                   /* judge whether the src is aligned. */
-       andi    r6, 3
-       cmpnei  r6, 0
-       jbf     .L0
-
-/* Judge the number of misaligned, 1, 2, 3? */
-.L_dest_aligned_but_src_not_aligned:
-       mov     r5, r3
-       rsub    r5, r5, r7
-       abs     r5, r5
-       cmplt   r5, r4
-       bt      .L_copy_by_byte
-       bclri   r3, 0
-       bclri   r3, 1
-       ldw     r1, (r3, 0)
-       addi    r3, 4
-       cmpnei  r6, 2
-       bf      .L_dest_aligned_but_src_not_aligned_2bytes
-       cmpnei  r6, 3
-       bf      .L_dest_aligned_but_src_not_aligned_3bytes
-
-.L_dest_aligned_but_src_not_aligned_1byte:
-       mov     r5, r7
-       sub     r5, r3
-       cmphs   r5, r4
-       bf      .L_copy_by_byte
-       cmplti  r4, 16
-       bf      .L11
-.L10:                                     /* If the len is less than 16 bytes */
-       GET_FRONT_BITS r1 8
-       mov     r5, r1
-       ldw     r6, (r3, 0)
-       mov     r1, r6
-       GET_AFTER_BITS r6 24
-       or      r5, r6
-       stw     r5, (r7, 0)
-       subi    r4, 4
-       addi    r3, 4
-       addi    r7, 4
-       cmplti  r4, 4
-       bf      .L10
-       subi    r3, 3
-       br      .L_copy_by_byte
-.L11:
-       subi    sp, 16
-       stw     r8, (sp, 0)
-       stw     r9, (sp, 4)
-       stw     r10, (sp, 8)
-       stw     r11, (sp, 12)
-.L12:
-       ldw     r5, (r3, 0)
-       ldw     r11, (r3, 4)
-       ldw     r8, (r3, 8)
-       ldw     r9, (r3, 12)
-
-       GET_FRONT_BITS r1 8               /* little or big endian? */
-       mov     r10, r5
-       GET_AFTER_BITS r5 24
-       or      r5, r1
-
-       GET_FRONT_BITS r10 8
-       mov     r1, r11
-       GET_AFTER_BITS r11 24
-       or      r11, r10
-
-       GET_FRONT_BITS r1 8
-       mov     r10, r8
-       GET_AFTER_BITS r8 24
-       or      r8, r1
-
-       GET_FRONT_BITS r10 8
-       mov     r1, r9
-       GET_AFTER_BITS r9 24
-       or      r9, r10
-
-       stw     r5, (r7, 0)
-       stw     r11, (r7, 4)
-       stw     r8, (r7, 8)
-       stw     r9, (r7, 12)
-       subi    r4, 16
-       addi    r3, 16
-       addi    r7, 16
-       cmplti  r4, 16
-       jbf     .L12
-       ldw     r8, (sp, 0)
-       ldw     r9, (sp, 4)
-       ldw     r10, (sp, 8)
-       ldw     r11, (sp, 12)
-       addi    sp , 16
-       cmplti  r4, 4
-       bf      .L10
-       subi    r3, 3
-       br      .L_copy_by_byte
-
-.L_dest_aligned_but_src_not_aligned_2bytes:
-       cmplti  r4, 16
-       bf      .L21
-.L20:
-       GET_FRONT_BITS r1 16
-       mov     r5, r1
-       ldw     r6, (r3, 0)
-       mov     r1, r6
-       GET_AFTER_BITS r6 16
-       or      r5, r6
-       stw     r5, (r7, 0)
-       subi    r4, 4
-       addi    r3, 4
-       addi    r7, 4
-       cmplti  r4, 4
-       bf      .L20
-       subi    r3, 2
-       br      .L_copy_by_byte
-       rts
-
-.L21:  /* n > 16 */
-       subi    sp, 16
-       stw     r8, (sp, 0)
-       stw     r9, (sp, 4)
-       stw     r10, (sp, 8)
-       stw     r11, (sp, 12)
-
-.L22:
-       ldw     r5, (r3, 0)
-       ldw     r11, (r3, 4)
-       ldw     r8, (r3, 8)
-       ldw     r9, (r3, 12)
-
-       GET_FRONT_BITS r1 16
-       mov     r10, r5
-       GET_AFTER_BITS r5 16
-       or      r5, r1
-
-       GET_FRONT_BITS r10 16
-       mov     r1, r11
-       GET_AFTER_BITS r11 16
-       or      r11, r10
-
-       GET_FRONT_BITS r1 16
-       mov     r10, r8
-       GET_AFTER_BITS r8 16
-       or      r8, r1
-
-       GET_FRONT_BITS r10 16
-       mov     r1, r9
-       GET_AFTER_BITS r9 16
-       or      r9, r10
-
-       stw     r5, (r7, 0)
-       stw     r11, (r7, 4)
-       stw     r8, (r7, 8)
-       stw     r9, (r7, 12)
-       subi    r4, 16
-       addi    r3, 16
-       addi    r7, 16
-       cmplti  r4, 16
-       jbf     .L22
-       ldw     r8, (sp, 0)
-       ldw     r9, (sp, 4)
-       ldw     r10, (sp, 8)
-       ldw     r11, (sp, 12)
-       addi    sp, 16
-       cmplti  r4, 4
-       bf      .L20
-       subi    r3, 2
-       br      .L_copy_by_byte
-
-
-.L_dest_aligned_but_src_not_aligned_3bytes:
-       cmplti  r4, 16
-       bf      .L31
-.L30:
-       GET_FRONT_BITS r1 24
-       mov     r5, r1
-       ldw     r6, (r3, 0)
-       mov     r1, r6
-       GET_AFTER_BITS r6 8
-       or      r5, r6
-       stw     r5, (r7, 0)
-       subi    r4, 4
-       addi    r3, 4
-       addi    r7, 4
-       cmplti  r4, 4
-       bf      .L30
-       subi    r3, 1
-       br      .L_copy_by_byte
-.L31:
-       subi    sp, 16
-       stw     r8, (sp, 0)
-       stw     r9, (sp, 4)
-       stw     r10, (sp, 8)
-       stw     r11, (sp, 12)
-.L32:
-       ldw     r5, (r3, 0)
-       ldw     r11, (r3, 4)
-       ldw     r8, (r3, 8)
-       ldw     r9, (r3, 12)
-
-       GET_FRONT_BITS r1 24
-       mov     r10, r5
-       GET_AFTER_BITS r5 8
-       or      r5, r1
-
-       GET_FRONT_BITS r10 24
-       mov     r1, r11
-       GET_AFTER_BITS r11 8
-       or      r11, r10
-
-       GET_FRONT_BITS r1 24
-       mov     r10, r8
-       GET_AFTER_BITS r8 8
-       or      r8, r1
-
-       GET_FRONT_BITS r10 24
-       mov     r1, r9
-       GET_AFTER_BITS r9 8
-       or      r9, r10
-
-       stw     r5, (r7, 0)
-       stw     r11, (r7, 4)
-       stw     r8, (r7, 8)
-       stw     r9, (r7, 12)
-       subi    r4, 16
-       addi    r3, 16
-       addi    r7, 16
-       cmplti  r4, 16
-       jbf     .L32
-       ldw     r8, (sp, 0)
-       ldw     r9, (sp, 4)
-       ldw     r10, (sp, 8)
-       ldw     r11, (sp, 12)
-       addi    sp, 16
-       cmplti  r4, 4
-       bf      .L30
-       subi    r3, 1
-       br      .L_copy_by_byte
diff --git a/arch/csky/abiv1/strksyms.c b/arch/csky/abiv1/strksyms.c
deleted file mode 100644
index c7ccbb2..0000000
--- a/arch/csky/abiv1/strksyms.c
+++ /dev/null
@@ -1,6 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
-
-#include <linux/module.h>
-
-EXPORT_SYMBOL(memcpy);
diff --git a/arch/csky/abiv2/Makefile b/arch/csky/abiv2/Makefile
index c561efa..ea8005f 100644
@@ -2,9 +2,11 @@
 obj-y                          += cacheflush.o
 obj-$(CONFIG_CPU_HAS_FPU)      += fpu.o
 obj-y                          += memcmp.o
+ifeq ($(CONFIG_HAVE_EFFICIENT_UNALIGNED_STRING_OPS), y)
 obj-y                          += memcpy.o
 obj-y                          += memmove.o
 obj-y                          += memset.o
+endif
 obj-y                          += strcmp.o
 obj-y                          += strcpy.o
 obj-y                          += strlen.o
diff --git a/arch/csky/abiv2/strksyms.c b/arch/csky/abiv2/strksyms.c
index 06da723..8d1fd28 100644
@@ -3,10 +3,12 @@
 
 #include <linux/module.h>
 
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_STRING_OPS
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(memset);
-EXPORT_SYMBOL(memcmp);
 EXPORT_SYMBOL(memmove);
+#endif
+EXPORT_SYMBOL(memcmp);
 EXPORT_SYMBOL(strcmp);
 EXPORT_SYMBOL(strcpy);
 EXPORT_SYMBOL(strlen);
diff --git a/arch/csky/lib/Makefile b/arch/csky/lib/Makefile
index 7fbdbb2..d0ce6e2 100644
@@ -1,3 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 lib-y  := usercopy.o delay.o
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
+ifneq ($(CONFIG_HAVE_EFFICIENT_UNALIGNED_STRING_OPS), y)
+lib-y  += string.o
+endif
diff --git a/arch/csky/lib/string.c b/arch/csky/lib/string.c
new file mode 100644
index 0000000..d65626f
--- /dev/null
+++ b/arch/csky/lib/string.c
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * String functions optimized for hardware which doesn't
+ * handle unaligned memory accesses efficiently.
+ *
+ * Copyright (C) 2021 Matteo Croce
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+
+/* Minimum size for a word copy to be convenient */
+#define BYTES_LONG     sizeof(long)
+#define WORD_MASK      (BYTES_LONG - 1)
+#define MIN_THRESHOLD  (BYTES_LONG * 2)
+
+/* Convenience unions to avoid casts between different pointer types */
+union types {
+       u8 *as_u8;
+       unsigned long *as_ulong;
+       uintptr_t as_uptr;
+};
+
+union const_types {
+       const u8 *as_u8;
+       unsigned long *as_ulong;
+       uintptr_t as_uptr;
+};
+
+void *memcpy(void *dest, const void *src, size_t count)
+{
+       union const_types s = { .as_u8 = src };
+       union types d = { .as_u8 = dest };
+       int distance = 0;
+
+       if (count < MIN_THRESHOLD)
+               goto copy_remainder;
+
+       /* Copy a byte at a time until the destination is aligned. */
+       for (; d.as_uptr & WORD_MASK; count--)
+               *d.as_u8++ = *s.as_u8++;
+
+       distance = s.as_uptr & WORD_MASK;
+
+       if (distance) {
+               unsigned long last, next;
+
+               /*
+                * s is distance bytes ahead of d, and d just reached
+                * the alignment boundary. Move s backward to word align it
+                * and shift the data to compensate for distance, in order
+                * to do a word-by-word copy.
+                */
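+               /*
+                * Illustrative example (little endian, BYTES_LONG == 4,
+                * distance == 1): the aligned word behind s holds bytes
+                * B0 B1 B2 B3 and the next one B4 B5 B6 B7. Then
+                * last >> 8 recovers B1 B2 B3, next << 24 contributes
+                * B4, and the word stored at d is exactly B1 B2 B3 B4,
+                * i.e. the four source bytes starting at the original s.
+                */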
+               s.as_u8 -= distance;
+
+               next = s.as_ulong[0];
+               for (; count >= BYTES_LONG; count -= BYTES_LONG) {
+                       last = next;
+                       next = s.as_ulong[1];
+
+                       d.as_ulong[0] = last >> (distance * 8) |
+                               next << ((BYTES_LONG - distance) * 8);
+
+                       d.as_ulong++;
+                       s.as_ulong++;
+               }
+
+               /* Restore s with the original offset. */
+               s.as_u8 += distance;
+       } else {
+               /*
+                * If the source and dest lower bits are the same, do a simple
+                * 32/64 bit wide copy.
+                */
+               for (; count >= BYTES_LONG; count -= BYTES_LONG)
+                       *d.as_ulong++ = *s.as_ulong++;
+       }
+
+copy_remainder:
+       while (count--)
+               *d.as_u8++ = *s.as_u8++;
+
+       return dest;
+}
+EXPORT_SYMBOL(memcpy);
+
+/*
+ * Simply check whether the buffers overlap and, if they don't, call
+ * memcpy(); otherwise do a simple byte-at-a-time backward copy.
+ */
+void *memmove(void *dest, const void *src, size_t count)
+{
+       if (dest < src || src + count <= dest)
+               return memcpy(dest, src, count);
+
+       if (dest > src) {
+               const char *s = src + count;
+               char *tmp = dest + count;
+
+               while (count--)
+                       *--tmp = *--s;
+       }
+       return dest;
+}
+EXPORT_SYMBOL(memmove);
+
+void *memset(void *s, int c, size_t count)
+{
+       union types dest = { .as_u8 = s };
+
+       if (count >= MIN_THRESHOLD) {
+               unsigned long cu = (unsigned long)c;
+
+               /* Compose an ulong with 'c' repeated 4/8 times */
+               cu |= cu << 8;
+               cu |= cu << 16;
+               /* Suppress warning on 32 bit machines */
+               cu |= (cu << 16) << 16;
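+               /* e.g. c == 0xab gives cu == 0xabababab (0xab..ab on 64 bit) */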
+
+               for (; count && dest.as_uptr & WORD_MASK; count--)
+                       *dest.as_u8++ = c;
+
+               /* Fill using the largest size allowed */
+               for (; count >= BYTES_LONG; count -= BYTES_LONG)
+                       *dest.as_ulong++ = cu;
+       }
+
+       /* Fill the remainder */
+       while (count--)
+               *dest.as_u8++ = c;
+
+       return s;
+}
+EXPORT_SYMBOL(memset);