From: Ard Biesheuvel
Date: Mon, 13 Dec 2021 14:02:52 +0000 (+0100)
Subject: arm64/xor: use EOR3 instructions when available
X-Git-Tag: v6.1-rc5~2154^2~3^5
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=2c54b423cf85baed5ad9f9546f6c8ea741774a06;p=platform%2Fkernel%2Flinux-starfive.git

arm64/xor: use EOR3 instructions when available

Use the EOR3 instruction to implement xor_blocks() if the instruction
is available, which is the case if the CPU implements the SHA-3
extension. This is about 20% faster on Apple M1 when using the 5-way
version.

Signed-off-by: Ard Biesheuvel
Link: https://lore.kernel.org/r/20211213140252.2856053-1-ardb@kernel.org
Signed-off-by: Catalin Marinas
---

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index c4207cf..63d41ba 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1545,6 +1545,12 @@ endmenu
 
 menu "ARMv8.2 architectural features"
 
+config AS_HAS_ARMV8_2
+	def_bool $(cc-option,-Wa$(comma)-march=armv8.2-a)
+
+config AS_HAS_SHA3
+	def_bool $(as-instr,.arch armv8.2-a+sha3)
+
 config ARM64_PMEM
 	bool "Enable support for persistent memory"
 	select ARCH_HAS_PMEM_API
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index e8cfc58..2f1de88 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -58,6 +58,11 @@ stack_protector_prepare: prepare0
 					include/generated/asm-offsets.h))
 endif
 
+ifeq ($(CONFIG_AS_HAS_ARMV8_2), y)
+# make sure to pass the newest target architecture to -march.
+asm-arch := armv8.2-a
+endif
+
 # Ensure that if the compiler supports branch protection we default it
 # off, this will be overridden if we are using branch protection.
 branch-prot-flags-y += $(call cc-option,-mbranch-protection=none)
diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c
index 11bf4f8a..d189cf4 100644
--- a/arch/arm64/lib/xor-neon.c
+++ b/arch/arm64/lib/xor-neon.c
@@ -167,7 +167,7 @@ void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
 	} while (--lines > 0);
 }
 
-struct xor_block_template const xor_block_inner_neon = {
+struct xor_block_template xor_block_inner_neon __ro_after_init = {
 	.name		= "__inner_neon__",
 	.do_2		= xor_arm64_neon_2,
 	.do_3		= xor_arm64_neon_3,
@@ -176,6 +176,151 @@ struct xor_block_template const xor_block_inner_neon = {
 };
 EXPORT_SYMBOL(xor_block_inner_neon);
 
+static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
+{
+	uint64x2_t res;
+
+	asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
+	    "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
+	    : "=w"(res) : "w"(p), "w"(q), "w"(r));
+	return res;
+}
+
+static void xor_arm64_eor3_3(unsigned long bytes, unsigned long *p1,
+			     unsigned long *p2, unsigned long *p3)
+{
+	uint64_t *dp1 = (uint64_t *)p1;
+	uint64_t *dp2 = (uint64_t *)p2;
+	uint64_t *dp3 = (uint64_t *)p3;
+
+	register uint64x2_t v0, v1, v2, v3;
+	long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+	do {
+		/* p1 ^= p2 ^ p3 */
+		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+			  vld1q_u64(dp3 + 0));
+		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+			  vld1q_u64(dp3 + 2));
+		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+			  vld1q_u64(dp3 + 4));
+		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+			  vld1q_u64(dp3 + 6));
+
+		/* store */
+		vst1q_u64(dp1 + 0, v0);
+		vst1q_u64(dp1 + 2, v1);
+		vst1q_u64(dp1 + 4, v2);
+		vst1q_u64(dp1 + 6, v3);
+
+		dp1 += 8;
+		dp2 += 8;
+		dp3 += 8;
+	} while (--lines > 0);
+}
+
+static void xor_arm64_eor3_4(unsigned long bytes, unsigned long *p1,
+			     unsigned long *p2, unsigned long *p3,
+			     unsigned long *p4)
+{
+	uint64_t *dp1 = (uint64_t *)p1;
+	uint64_t *dp2 = (uint64_t *)p2;
+	uint64_t *dp3 = (uint64_t *)p3;
+	uint64_t *dp4 = (uint64_t *)p4;
+
+	register uint64x2_t v0, v1, v2, v3;
+	long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+	do {
+		/* p1 ^= p2 ^ p3 */
+		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+			  vld1q_u64(dp3 + 0));
+		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+			  vld1q_u64(dp3 + 2));
+		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+			  vld1q_u64(dp3 + 4));
+		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+			  vld1q_u64(dp3 + 6));
+
+		/* p1 ^= p4 */
+		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
+		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
+		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
+		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
+
+		/* store */
+		vst1q_u64(dp1 + 0, v0);
+		vst1q_u64(dp1 + 2, v1);
+		vst1q_u64(dp1 + 4, v2);
+		vst1q_u64(dp1 + 6, v3);
+
+		dp1 += 8;
+		dp2 += 8;
+		dp3 += 8;
+		dp4 += 8;
+	} while (--lines > 0);
+}
+
+static void xor_arm64_eor3_5(unsigned long bytes, unsigned long *p1,
+			     unsigned long *p2, unsigned long *p3,
+			     unsigned long *p4, unsigned long *p5)
+{
+	uint64_t *dp1 = (uint64_t *)p1;
+	uint64_t *dp2 = (uint64_t *)p2;
+	uint64_t *dp3 = (uint64_t *)p3;
+	uint64_t *dp4 = (uint64_t *)p4;
+	uint64_t *dp5 = (uint64_t *)p5;
+
+	register uint64x2_t v0, v1, v2, v3;
+	long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+	do {
+		/* p1 ^= p2 ^ p3 */
+		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+			  vld1q_u64(dp3 + 0));
+		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+			  vld1q_u64(dp3 + 2));
+		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+			  vld1q_u64(dp3 + 4));
+		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+			  vld1q_u64(dp3 + 6));
+
+		/* p1 ^= p4 ^ p5 */
+		v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
+		v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
+		v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
+		v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));
+
+		/* store */
+		vst1q_u64(dp1 + 0, v0);
+		vst1q_u64(dp1 + 2, v1);
+		vst1q_u64(dp1 + 4, v2);
+		vst1q_u64(dp1 + 6, v3);
+
+		dp1 += 8;
+		dp2 += 8;
+		dp3 += 8;
+		dp4 += 8;
+		dp5 += 8;
+	} while (--lines > 0);
+}
+
+static int __init xor_neon_init(void)
+{
+	if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
+		xor_block_inner_neon.do_3 = xor_arm64_eor3_3;
+		xor_block_inner_neon.do_4 = xor_arm64_eor3_4;
+		xor_block_inner_neon.do_5 = xor_arm64_eor3_5;
+	}
+	return 0;
+}
+module_init(xor_neon_init);
+
+static void __exit xor_neon_exit(void)
+{
+}
+module_exit(xor_neon_exit);
+
 MODULE_AUTHOR("Jackie Liu ");
 MODULE_DESCRIPTION("ARMv8 XOR Extensions");
 MODULE_LICENSE("GPL");
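
A note for readers, not part of the patch above: EOR3 (part of the ARMv8.2
SHA-3 extension) computes the bitwise XOR of three 128-bit vectors in a
single instruction, so each eor3() call in the patch replaces a pair of
plain EORs in the hot loop. xor_neon_init() installs the new 3-/4-/5-way
routines once at boot, and only when both the assembler (CONFIG_AS_HAS_SHA3)
and the running CPU (cpu_have_named_feature(SHA3)) support the extension.
The standalone user-space sketch below illustrates the semantics the patch
relies on; it is a minimal example, assuming an aarch64 host with
<arm_neon.h>, and the file name and test data are invented for illustration.

/*
 * eor3-demo.c (hypothetical file name): show that the two veorq_u64()
 * calls in the NEON fallback compute exactly the three-way XOR that a
 * single EOR3 instruction performs.
 *
 * Build on an aarch64 host, e.g.:  gcc -O2 -o eor3-demo eor3-demo.c
 */
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Scalar reference: p1 ^= p2 ^ p3, one 64-bit word at a time. */
static void xor3_scalar(uint64_t *p1, const uint64_t *p2,
			const uint64_t *p3, size_t words)
{
	for (size_t i = 0; i < words; i++)
		p1[i] = p1[i] ^ p2[i] ^ p3[i];
}

/*
 * NEON version: the two veorq_u64() calls per iteration are the pair
 * that a single EOR3 (the patch's eor3() helper) folds into one
 * instruction on CPUs with the SHA-3 extension.
 */
static void xor3_neon(uint64_t *p1, const uint64_t *p2,
		      const uint64_t *p3, size_t words)
{
	for (size_t i = 0; i < words; i += 2) {
		uint64x2_t v = veorq_u64(vld1q_u64(p1 + i),
					 vld1q_u64(p2 + i));
		v = veorq_u64(v, vld1q_u64(p3 + i));
		vst1q_u64(p1 + i, v);
	}
}

int main(void)
{
	uint64_t a[8], b[8], c[8], ref[8];

	/* Arbitrary test data, invented for the example. */
	for (int i = 0; i < 8; i++) {
		a[i] = 0x0123456789abcdefULL * (i + 1);
		b[i] = ~a[i];
		c[i] = a[i] >> 3;
	}
	memcpy(ref, a, sizeof(ref));

	xor3_scalar(ref, b, c, 8);
	xor3_neon(a, b, c, 8);

	printf("%s\n", memcmp(a, ref, sizeof(a)) ? "MISMATCH" : "match");
	return 0;
}

Note that the kernel makes the EOR3-vs-EOR choice once at boot by patching
the xor_block_template function pointers, rather than testing the CPU
feature on every xor_blocks() call.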