md/raid6: implement recovery using ARM NEON intrinsics
authorArd Biesheuvel <ard.biesheuvel@linaro.org>
Thu, 13 Jul 2017 17:16:01 +0000 (18:16 +0100)
committerCatalin Marinas <catalin.marinas@arm.com>
Wed, 9 Aug 2017 17:52:07 +0000 (18:52 +0100)
Provide a NEON accelerated implementation of the recovery algorithm,
which supersedes the default byte-by-byte one.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
include/linux/raid/pq.h
lib/raid6/Makefile
lib/raid6/algos.c
lib/raid6/recov_neon.c [new file with mode: 0644]
lib/raid6/recov_neon_inner.c [new file with mode: 0644]

index 30f9453..583cdd3 100644 (file)
@@ -121,6 +121,7 @@ extern const struct raid6_recov_calls raid6_recov_ssse3;
 extern const struct raid6_recov_calls raid6_recov_avx2;
 extern const struct raid6_recov_calls raid6_recov_avx512;
 extern const struct raid6_recov_calls raid6_recov_s390xc;
+extern const struct raid6_recov_calls raid6_recov_neon;
 
 extern const struct raid6_calls raid6_neonx1;
 extern const struct raid6_calls raid6_neonx2;
index 3057011..a93adf6 100644 (file)
@@ -5,7 +5,7 @@ raid6_pq-y      += algos.o recov.o tables.o int1.o int2.o int4.o \
 
 raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o recov_avx512.o
 raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o
-raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o
+raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o
 raid6_pq-$(CONFIG_TILEGX) += tilegx8.o
 raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o
 
@@ -26,7 +26,9 @@ NEON_FLAGS := -ffreestanding
 ifeq ($(ARCH),arm)
 NEON_FLAGS += -mfloat-abi=softfp -mfpu=neon
 endif
+CFLAGS_recov_neon_inner.o += $(NEON_FLAGS)
 ifeq ($(ARCH),arm64)
+CFLAGS_REMOVE_recov_neon_inner.o += -mgeneral-regs-only
 CFLAGS_REMOVE_neon1.o += -mgeneral-regs-only
 CFLAGS_REMOVE_neon2.o += -mgeneral-regs-only
 CFLAGS_REMOVE_neon4.o += -mgeneral-regs-only
index 7857049..4769947 100644 (file)
@@ -113,6 +113,9 @@ const struct raid6_recov_calls *const raid6_recov_algos[] = {
 #ifdef CONFIG_S390
        &raid6_recov_s390xc,
 #endif
+#if defined(CONFIG_KERNEL_MODE_NEON)
+       &raid6_recov_neon,
+#endif
        &raid6_recov_intx1,
        NULL
 };
diff --git a/lib/raid6/recov_neon.c b/lib/raid6/recov_neon.c
new file mode 100644 (file)
index 0000000..eeb5c40
--- /dev/null
@@ -0,0 +1,110 @@
+/*
+ * Copyright (C) 2012 Intel Corporation
+ * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/raid/pq.h>
+
+#ifdef __KERNEL__
+#include <asm/neon.h>
+#else
+#define kernel_neon_begin()
+#define kernel_neon_end()
+#define cpu_has_neon()         (1)
+#endif
+
+static int raid6_has_neon(void)
+{
+       return cpu_has_neon();
+}
+
+void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
+                             uint8_t *dq, const uint8_t *pbmul,
+                             const uint8_t *qmul);
+
+void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
+                             const uint8_t *qmul);
+
+static void raid6_2data_recov_neon(int disks, size_t bytes, int faila,
+               int failb, void **ptrs)
+{
+       u8 *p, *q, *dp, *dq;
+       const u8 *pbmul;        /* P multiplier table for B data */
+       const u8 *qmul;         /* Q multiplier table (for both) */
+
+       p = (u8 *)ptrs[disks - 2];
+       q = (u8 *)ptrs[disks - 1];
+
+       /*
+        * Compute syndrome with zero for the missing data pages
+        * Use the dead data pages as temporary storage for
+        * delta p and delta q
+        */
+       dp = (u8 *)ptrs[faila];
+       ptrs[faila] = (void *)raid6_empty_zero_page;
+       ptrs[disks - 2] = dp;
+       dq = (u8 *)ptrs[failb];
+       ptrs[failb] = (void *)raid6_empty_zero_page;
+       ptrs[disks - 1] = dq;
+
+       raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+       /* Restore pointer table */
+       ptrs[faila]     = dp;
+       ptrs[failb]     = dq;
+       ptrs[disks - 2] = p;
+       ptrs[disks - 1] = q;
+
+       /* Now, pick the proper data tables */
+       pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
+       qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
+                                        raid6_gfexp[failb]]];
+
+       kernel_neon_begin();
+       __raid6_2data_recov_neon(bytes, p, q, dp, dq, pbmul, qmul);
+       kernel_neon_end();
+}
+
+static void raid6_datap_recov_neon(int disks, size_t bytes, int faila,
+               void **ptrs)
+{
+       u8 *p, *q, *dq;
+       const u8 *qmul;         /* Q multiplier table */
+
+       p = (u8 *)ptrs[disks - 2];
+       q = (u8 *)ptrs[disks - 1];
+
+       /*
+        * Compute syndrome with zero for the missing data page
+        * Use the dead data page as temporary storage for delta q
+        */
+       dq = (u8 *)ptrs[faila];
+       ptrs[faila] = (void *)raid6_empty_zero_page;
+       ptrs[disks - 1] = dq;
+
+       raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+       /* Restore pointer table */
+       ptrs[faila]     = dq;
+       ptrs[disks - 1] = q;
+
+       /* Now, pick the proper data tables */
+       qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+       kernel_neon_begin();
+       __raid6_datap_recov_neon(bytes, p, q, dq, qmul);
+       kernel_neon_end();
+}
+
+const struct raid6_recov_calls raid6_recov_neon = {
+       .data2          = raid6_2data_recov_neon,
+       .datap          = raid6_datap_recov_neon,
+       .valid          = raid6_has_neon,
+       .name           = "neon",
+       .priority       = 10,
+};
diff --git a/lib/raid6/recov_neon_inner.c b/lib/raid6/recov_neon_inner.c
new file mode 100644 (file)
index 0000000..8cd20c9
--- /dev/null
@@ -0,0 +1,117 @@
+/*
+ * Copyright (C) 2012 Intel Corporation
+ * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <arm_neon.h>
+
+static const uint8x16_t x0f = {
+       0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+       0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+};
+
+#ifdef CONFIG_ARM
+/*
+ * AArch32 does not provide this intrinsic natively because it does not
+ * implement the underlying instruction. AArch32 only provides a 64-bit
+ * wide vtbl.8 instruction, so use that instead.
+ */
+static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
+{
+       union {
+               uint8x16_t      val;
+               uint8x8x2_t     pair;
+       } __a = { a };
+
+       return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)),
+                          vtbl2_u8(__a.pair, vget_high_u8(b)));
+}
+#endif
+
+void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
+                             uint8_t *dq, const uint8_t *pbmul,
+                             const uint8_t *qmul)
+{
+       uint8x16_t pm0 = vld1q_u8(pbmul);
+       uint8x16_t pm1 = vld1q_u8(pbmul + 16);
+       uint8x16_t qm0 = vld1q_u8(qmul);
+       uint8x16_t qm1 = vld1q_u8(qmul + 16);
+
+       /*
+        * while ( bytes-- ) {
+        *      uint8_t px, qx, db;
+        *
+        *      px    = *p ^ *dp;
+        *      qx    = qmul[*q ^ *dq];
+        *      *dq++ = db = pbmul[px] ^ qx;
+        *      *dp++ = db ^ px;
+        *      p++; q++;
+        * }
+        */
+
+       while (bytes) {
+               uint8x16_t vx, vy, px, qx, db;
+
+               px = veorq_u8(vld1q_u8(p), vld1q_u8(dp));
+               vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
+
+               vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4);
+               vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
+               vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f));
+               qx = veorq_u8(vx, vy);
+
+               vy = (uint8x16_t)vshrq_n_s16((int16x8_t)px, 4);
+               vx = vqtbl1q_u8(pm0, vandq_u8(px, x0f));
+               vy = vqtbl1q_u8(pm1, vandq_u8(vy, x0f));
+               vx = veorq_u8(vx, vy);
+               db = veorq_u8(vx, qx);
+
+               vst1q_u8(dq, db);
+               vst1q_u8(dp, veorq_u8(db, px));
+
+               bytes -= 16;
+               p += 16;
+               q += 16;
+               dp += 16;
+               dq += 16;
+       }
+}
+
+void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
+                             const uint8_t *qmul)
+{
+       uint8x16_t qm0 = vld1q_u8(qmul);
+       uint8x16_t qm1 = vld1q_u8(qmul + 16);
+
+       /*
+        * while (bytes--) {
+        *      *p++ ^= *dq = qmul[*q ^ *dq];
+        *      q++; dq++;
+        * }
+        */
+
+       while (bytes) {
+               uint8x16_t vx, vy;
+
+               vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
+
+               vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4);
+               vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
+               vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f));
+               vx = veorq_u8(vx, vy);
+               vy = veorq_u8(vx, vld1q_u8(p));
+
+               vst1q_u8(dq, vx);
+               vst1q_u8(p, vy);
+
+               bytes -= 16;
+               p += 16;
+               q += 16;
+               dq += 16;
+       }
+}