LoongArch: Add subword xchg/cmpxchg emulation
author Huacai Chen <chenhuacai@loongson.cn>
Thu, 25 Aug 2022 11:34:59 +0000 (19:34 +0800)
committer Huacai Chen <chenhuacai@loongson.cn>
Thu, 25 Aug 2022 11:34:59 +0000 (19:34 +0800)
LoongArch natively supports only 32-bit/64-bit xchg/cmpxchg, but percpu
operations, qspinlock and some drivers need 8-bit/16-bit xchg/cmpxchg. We
add subword xchg/cmpxchg emulation in this patch because the emulation
has better performance than the generic implementation (on NUMA systems),
and it also fixes some build errors [1].

LoongArch's guarantee of forward progress (i.e. avoiding the situation
where many ll/sc sequences run at the same time and none succeeds):

We have the "exclusive access (with timeout) of ll" feature to avoid
simultaneous ll (which also blocks other memory load/store on the same
address), and the "random delay of sc" feature to avoid simultaneous
sc. Implementing these features is mandatory for multi-core LoongArch
processors, with the sole exception of single-core and dual-core
processors (which also don't support multi-chip interconnection).

Feature bits are introduced in CPUCFG3, bit 3 and bit 4 [2].

[1] https://lore.kernel.org/loongarch/CAAhV-H6vvkuOzy8OemWdYK3taj5Jn3bFX0ZTwE=twM8ywpBUYA@mail.gmail.com/T/#t
[2] https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#_cpucfg

Reported-by: Sudip Mukherjee (Codethink) <sudipm.mukherjee@gmail.com>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Rui Wang <wangrui@loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
arch/loongarch/include/asm/cmpxchg.h
arch/loongarch/include/asm/percpu.h

index 0a9b0fac1eeeb6115bfcd444d35b1975f45a1277..ae19e33c77548aed5f94c59f0c727c1d535eb647 100644 (file)
@@ -5,8 +5,9 @@
 #ifndef __ASM_CMPXCHG_H
 #define __ASM_CMPXCHG_H
 
-#include <asm/barrier.h>
+#include <linux/bits.h>
 #include <linux/build_bug.h>
+#include <asm/barrier.h>
 
 #define __xchg_asm(amswap_db, m, val)          \
 ({                                             \
                __ret;                          \
 })
 
+static inline unsigned int __xchg_small(volatile void *ptr, unsigned int val,
+                                       unsigned int size)
+{
+       unsigned int shift;
+       u32 old32, mask, temp;
+       volatile u32 *ptr32;
+
+       /* Sub-word xchg via 32-bit ll/sc: mask val to size bytes first. */
+       mask = GENMASK((size * BITS_PER_BYTE) - 1, 0);
+       val &= mask;
+
+       /*
+        * Calculate a shift & mask that correspond to the value we wish to
+        * exchange within the naturally aligned 4 byte integer that includes
+        * it: shift is the byte offset of ptr in that word, counted in bits.
+        */
+       shift = (unsigned long)ptr & 0x3;
+       shift *= BITS_PER_BYTE;
+       mask <<= shift;
+
+       /*
+        * Calculate a pointer to the naturally aligned 4 byte integer that
+        * includes our byte of interest; the ll.w/sc.w loop below updates it.
+        */
+       ptr32 = (volatile u32 *)((unsigned long)ptr & ~0x3);
+
+       asm volatile (
+       "1:     ll.w            %0, %3          \n" /* old32 = *ptr32 */
+       "       andn            %1, %0, %z4     \n" /* temp = old32 & ~mask */
+       "       or              %1, %1, %z5     \n" /* temp |= val << shift */
+       "       sc.w            %1, %2          \n" /* try to store temp */
+       "       beqz            %1, 1b          \n" /* retry while temp == 0 */
+       : "=&r" (old32), "=&r" (temp), "=ZC" (*ptr32)
+       : "ZC" (*ptr32), "Jr" (mask), "Jr" (val << shift)
+       : "memory");
+
+       return (old32 & mask) >> shift; /* previous sub-word value */
+}
+
 static inline unsigned long __xchg(volatile void *ptr, unsigned long x,
                                   int size)
 {
        switch (size) {
+       case 1:
+       case 2:
+               return __xchg_small(ptr, x, size);
+
        case 4:
                return __xchg_asm("amswap_db.w", (volatile u32 *)ptr, (u32)x);
 
@@ -67,10 +111,62 @@ static inline unsigned long __xchg(volatile void *ptr, unsigned long x,
        __ret;                                                          \
 })
 
+static inline unsigned int __cmpxchg_small(volatile void *ptr, unsigned int old,
+                                          unsigned int new, unsigned int size)
+{
+       unsigned int shift;
+       u32 old32, mask, temp;
+       volatile u32 *ptr32;
+
+       /* Sub-word cmpxchg via 32-bit ll/sc: mask inputs to size bytes. */
+       mask = GENMASK((size * BITS_PER_BYTE) - 1, 0);
+       old &= mask;
+       new &= mask;
+
+       /*
+        * Calculate a shift & mask that correspond to the value we wish to
+        * compare & exchange within the naturally aligned 4 byte integer
+        * that includes it, and move old/new into that position.
+        */
+       shift = (unsigned long)ptr & 0x3;
+       shift *= BITS_PER_BYTE;
+       old <<= shift;
+       new <<= shift;
+       mask <<= shift;
+
+       /*
+        * Calculate a pointer to the naturally aligned 4 byte integer that
+        * includes our byte of interest; the ll.w/sc.w loop below updates it.
+        */
+       ptr32 = (volatile u32 *)((unsigned long)ptr & ~0x3);
+
+       asm volatile (
+       "1:     ll.w            %0, %3          \n" /* old32 = *ptr32 */
+       "       and             %1, %0, %z4     \n" /* temp = old32 & mask */
+       "       bne             %1, %z5, 2f     \n" /* mismatch: skip the store */
+       "       andn            %1, %0, %z4     \n" /* temp = old32 & ~mask */
+       "       or              %1, %1, %z6     \n" /* temp |= new */
+       "       sc.w            %1, %2          \n" /* try to store temp */
+       "       beqz            %1, 1b          \n" /* retry while temp == 0 */
+       "       b               3f              \n" /* done: jump past label 2 */
+       "2:                                     \n"
+       __WEAK_LLSC_MB /* reached only via the bne above (compare failed) */
+       "3:                                     \n"
+       : "=&r" (old32), "=&r" (temp), "=ZC" (*ptr32)
+       : "ZC" (*ptr32), "Jr" (mask), "Jr" (old), "Jr" (new)
+       : "memory");
+
+       return (old32 & mask) >> shift; /* previous sub-word value */
+}
+
 static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
                                      unsigned long new, unsigned int size)
 {
        switch (size) {
+       case 1:
+       case 2:
+               return __cmpxchg_small(ptr, old, new, size);
+
        case 4:
                return __cmpxchg_asm("ll.w", "sc.w", (volatile u32 *)ptr,
                                     (u32)old, new);
index e6569f18c6ddfcd5e8d75bde426c4e4b5a7071fe..0bd6b0110198f776a7bc154f052cfc0d370ac1d8 100644 (file)
@@ -123,6 +123,10 @@ static inline unsigned long __percpu_xchg(void *ptr, unsigned long val,
                                                int size)
 {
        switch (size) {
+       case 1:
+       case 2:
+               return __xchg_small((volatile void *)ptr, val, size);
+
        case 4:
                return __xchg_asm("amswap.w", (volatile u32 *)ptr, (u32)val);
 
@@ -204,9 +208,13 @@ do {                                                                       \
 #define this_cpu_write_4(pcp, val) _percpu_write(pcp, val)
 #define this_cpu_write_8(pcp, val) _percpu_write(pcp, val)
 
+#define this_cpu_xchg_1(pcp, val) _percpu_xchg(pcp, val)
+#define this_cpu_xchg_2(pcp, val) _percpu_xchg(pcp, val)
 #define this_cpu_xchg_4(pcp, val) _percpu_xchg(pcp, val)
 #define this_cpu_xchg_8(pcp, val) _percpu_xchg(pcp, val)
 
+#define this_cpu_cmpxchg_1(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
+#define this_cpu_cmpxchg_2(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
 #define this_cpu_cmpxchg_4(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
 #define this_cpu_cmpxchg_8(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)