arm64/xor: use EOR3 instructions when available

author Ard Biesheuvel <ardb@kernel.org>

Mon, 13 Dec 2021 14:02:52 +0000 (15:02 +0100)

committer Catalin Marinas <catalin.marinas@arm.com>

Tue, 14 Dec 2021 12:14:26 +0000 (12:14 +0000)
author Ard Biesheuvel <ardb@kernel.org>
Mon, 13 Dec 2021 14:02:52 +0000 (15:02 +0100)
committer Catalin Marinas <catalin.marinas@arm.com>
Tue, 14 Dec 2021 12:14:26 +0000 (12:14 +0000)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig

index c4207cf..63d41ba 100644 (file)
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1545,6 +1545,12 @@ endmenu
  
  menu "ARMv8.2 architectural features"
  
+config AS_HAS_ARMV8_2
+       def_bool $(cc-option,-Wa$(comma)-march=armv8.2-a)
+
+config AS_HAS_SHA3
+       def_bool $(as-instr,.arch armv8.2-a+sha3)
+
  config ARM64_PMEM
         bool "Enable support for persistent memory"
         select ARCH_HAS_PMEM_API
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile

index e8cfc58..2f1de88 100644 (file)
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -58,6 +58,11 @@ stack_protector_prepare: prepare0
                                         include/generated/asm-offsets.h))
  endif
  
+ifeq ($(CONFIG_AS_HAS_ARMV8_2), y)
+# make sure to pass the newest target architecture to -march.
+asm-arch := armv8.2-a
+endif
+
  # Ensure that if the compiler supports branch protection we default it
  # off, this will be overridden if we are using branch protection.
  branch-prot-flags-y += $(call cc-option,-mbranch-protection=none)
diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c

index 11bf4f8..d189cf4 100644 (file)
--- a/arch/arm64/lib/xor-neon.c
+++ b/arch/arm64/lib/xor-neon.c
@@ -167,7 +167,7 @@ void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
         } while (--lines > 0);
  }
  
-struct xor_block_template const xor_block_inner_neon = {
+struct xor_block_template xor_block_inner_neon __ro_after_init = {
         .name   = "__inner_neon__",
         .do_2   = xor_arm64_neon_2,
         .do_3   = xor_arm64_neon_3,
@@ -176,6 +176,151 @@ struct xor_block_template const xor_block_inner_neon = {
  };
  EXPORT_SYMBOL(xor_block_inner_neon);
  
+static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
+{      /* returns p ^ q ^ r, computed by a single EOR3 instruction */
+       uint64x2_t res;
+
+       asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n" /* let the assembler accept EOR3 */
+           "eor3 %0.16b, %1.16b, %2.16b, %3.16b"       /* %0 = res; %1..%3 = p, q, r */
+           : "=w"(res) : "w"(p), "w"(q), "w"(r));
+       return res;
+}
+
+static void xor_arm64_eor3_3(unsigned long bytes, unsigned long *p1,
+                            unsigned long *p2, unsigned long *p3)
+{      /* XORs the p2 and p3 buffers into p1 in place, using eor3() */
+       uint64_t *dp1 = (uint64_t *)p1;
+       uint64_t *dp2 = (uint64_t *)p2;
+       uint64_t *dp3 = (uint64_t *)p3;
+
+       register uint64x2_t v0, v1, v2, v3;
+       long lines = bytes / (sizeof(uint64x2_t) * 4);  /* number of four-vector chunks */
+
+       do {
+               /* v0..v3 = p1 ^ p2 ^ p3 for the next four vectors (one eor3 each) */
+               v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+                         vld1q_u64(dp3 + 0));
+               v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+                         vld1q_u64(dp3 + 2));
+               v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+                         vld1q_u64(dp3 + 4));
+               v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+                         vld1q_u64(dp3 + 6));
+
+               /* write the four results back to p1 */
+               vst1q_u64(dp1 + 0, v0);
+               vst1q_u64(dp1 + 2, v1);
+               vst1q_u64(dp1 + 4, v2);
+               vst1q_u64(dp1 + 6, v3);
+
+               dp1 += 8;       /* advance every buffer by eight u64 words (four vectors) */
+               dp2 += 8;
+               dp3 += 8;
+       } while (--lines > 0);  /* do-while: one chunk is processed even if lines starts at 0 */
+}
+
+static void xor_arm64_eor3_4(unsigned long bytes, unsigned long *p1,
+                            unsigned long *p2, unsigned long *p3,
+                            unsigned long *p4)
+{      /* XORs the p2, p3 and p4 buffers into p1 in place */
+       uint64_t *dp1 = (uint64_t *)p1;
+       uint64_t *dp2 = (uint64_t *)p2;
+       uint64_t *dp3 = (uint64_t *)p3;
+       uint64_t *dp4 = (uint64_t *)p4;
+
+       register uint64x2_t v0, v1, v2, v3;
+       long lines = bytes / (sizeof(uint64x2_t) * 4);  /* number of four-vector chunks */
+
+       do {
+               /* v0..v3 = p1 ^ p2 ^ p3 for the next four vectors (one eor3 each) */
+               v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+                         vld1q_u64(dp3 + 0));
+               v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+                         vld1q_u64(dp3 + 2));
+               v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+                         vld1q_u64(dp3 + 4));
+               v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+                         vld1q_u64(dp3 + 6));
+
+               /* fold in p4 with a plain two-input XOR (only one source left) */
+               v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
+               v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
+               v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
+               v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
+
+               /* write the four results back to p1 */
+               vst1q_u64(dp1 + 0, v0);
+               vst1q_u64(dp1 + 2, v1);
+               vst1q_u64(dp1 + 4, v2);
+               vst1q_u64(dp1 + 6, v3);
+
+               dp1 += 8;       /* advance every buffer by eight u64 words (four vectors) */
+               dp2 += 8;
+               dp3 += 8;
+               dp4 += 8;
+       } while (--lines > 0);  /* do-while: one chunk is processed even if lines starts at 0 */
+}
+
+static void xor_arm64_eor3_5(unsigned long bytes, unsigned long *p1,
+                            unsigned long *p2, unsigned long *p3,
+                            unsigned long *p4, unsigned long *p5)
+{      /* XORs the p2, p3, p4 and p5 buffers into p1 in place */
+       uint64_t *dp1 = (uint64_t *)p1;
+       uint64_t *dp2 = (uint64_t *)p2;
+       uint64_t *dp3 = (uint64_t *)p3;
+       uint64_t *dp4 = (uint64_t *)p4;
+       uint64_t *dp5 = (uint64_t *)p5;
+
+       register uint64x2_t v0, v1, v2, v3;
+       long lines = bytes / (sizeof(uint64x2_t) * 4);  /* number of four-vector chunks */
+
+       do {
+               /* v0..v3 = p1 ^ p2 ^ p3 for the next four vectors (one eor3 each) */
+               v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+                         vld1q_u64(dp3 + 0));
+               v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+                         vld1q_u64(dp3 + 2));
+               v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+                         vld1q_u64(dp3 + 4));
+               v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+                         vld1q_u64(dp3 + 6));
+
+               /* fold in p4 and p5 with a second eor3 per vector */
+               v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
+               v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
+               v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
+               v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));
+
+               /* write the four results back to p1 */
+               vst1q_u64(dp1 + 0, v0);
+               vst1q_u64(dp1 + 2, v1);
+               vst1q_u64(dp1 + 4, v2);
+               vst1q_u64(dp1 + 6, v3);
+
+               dp1 += 8;       /* advance every buffer by eight u64 words (four vectors) */
+               dp2 += 8;
+               dp3 += 8;
+               dp4 += 8;
+               dp5 += 8;
+       } while (--lines > 0);  /* do-while: one chunk is processed even if lines starts at 0 */
+}
+
+static int __init xor_neon_init(void)
+{      /* installs the EOR3 routines as do_3/do_4/do_5 when SHA3 is usable; always returns 0 */
+       if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
+               xor_block_inner_neon.do_3 = xor_arm64_eor3_3;
+               xor_block_inner_neon.do_4 = xor_arm64_eor3_4;
+               xor_block_inner_neon.do_5 = xor_arm64_eor3_5;
+       }       /* do_2 is not reassigned here */
+       return 0;
+}
+module_init(xor_neon_init);
+
+static void __exit xor_neon_exit(void)
+{      /* intentionally empty: no teardown work is performed */
+}
+module_exit(xor_neon_exit);
+
  MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
  MODULE_DESCRIPTION("ARMv8 XOR Extensions");
  MODULE_LICENSE("GPL");
author	Ard Biesheuvel <ardb@kernel.org>
	Mon, 13 Dec 2021 14:02:52 +0000 (15:02 +0100)
committer	Catalin Marinas <catalin.marinas@arm.com>
	Tue, 14 Dec 2021 12:14:26 +0000 (12:14 +0000)
arch/arm64/Kconfig		patch \| blob \| history
arch/arm64/Makefile		patch \| blob \| history
arch/arm64/lib/xor-neon.c		patch \| blob \| history