crypto: arm/curve25519 - wire up NEON implementation
author Jason A. Donenfeld <Jason@zx2c4.com>
Fri, 8 Nov 2019 12:22:38 +0000 (13:22 +0100)
committer Herbert Xu <herbert@gondor.apana.org.au>
Sun, 17 Nov 2019 01:02:44 +0000 (09:02 +0800)
This ports the SUPERCOP implementation for usage in kernel space. In
addition to the usual header, macro, and style changes required for
kernel space, it makes a few small changes to the code:

  - The stack alignment is relaxed to 16 bytes.
  - Superfluous mov statements have been removed.
  - ldr for constants has been replaced with movw.
  - ldreq has been replaced with moveq.
  - The str epilogue has been made more idiomatic.
  - SIMD registers are not pushed and popped at the beginning and end.
  - The prologue and epilogue have been made idiomatic.
  - A hole has been removed from the stack, saving 32 bytes.
  - We write-back the base register whenever possible for vld1.8.
  - Some multiplications have been reordered for better A7 performance.
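
For instance, here are two of those replacements as they appear in the
diff below (the '@' comments are added here for illustration only):

        -       ldr             r5, =254        @ constant via literal pool
        +       movw            r5, #254        @ constant as an immediate

        -       ldreq           r5, =1          @ conditional literal-pool load
        +       moveq           r5, #1          @ conditional immediate move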

There are more opportunities for cleanup, since this code is from qhasm,
which doesn't always do the most opportune thing. But even prior to
extensive hand optimizations, this code delivers significant performance
improvements (given in get_cycles() per call):

 ------------ ----------- -------------
|            | generic C | this commit |
 ------------ ----------- -------------
| Cortex-A7  |     49136 |       22395 |
 ------------ ----------- -------------
| Cortex-A17 |     17326 |        4983 |
 ------------ ----------- -------------

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
[ardb: - move to arch/arm/crypto
       - wire into lib/crypto framework
       - implement crypto API KPP hooks ]
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
arch/arm/crypto/Kconfig
arch/arm/crypto/Makefile
arch/arm/crypto/curve25519-core.S
arch/arm/crypto/curve25519-glue.c [new file with mode: 0644]

diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index 166d32616feae6bb7a7ab50a46d6dd91071d5626..ab676229b0dae96e1e65cb3831aaaab44aab6cfa 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -142,4 +142,10 @@ config CRYPTO_NHPOLY1305_NEON
        depends on KERNEL_MODE_NEON
        select CRYPTO_NHPOLY1305
 
+config CRYPTO_CURVE25519_NEON
+       tristate "NEON accelerated Curve25519 scalar multiplication library"
+       depends on KERNEL_MODE_NEON
+       select CRYPTO_LIB_CURVE25519_GENERIC
+       select CRYPTO_ARCH_HAVE_LIB_CURVE25519
+
 endif
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index d568d699b3b7df20f967dd36cc7f7178d1dc4de1..b745c17d356fe6ea83989f1dd647427395a71304 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
 obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
 obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
 obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
+obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o
 
 obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
@@ -38,6 +39,7 @@ chacha-neon-y := chacha-scalar-core.o chacha-glue.o
 chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
 poly1305-arm-y := poly1305-core.o poly1305-glue.o
 nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
+curve25519-neon-y := curve25519-core.o curve25519-glue.o
 
 ifdef REGENERATE_ARM_CRYPTO
 quiet_cmd_perl = PERL    $@
diff --git a/arch/arm/crypto/curve25519-core.S b/arch/arm/crypto/curve25519-core.S
index f33b85fef3823ca3b83c3f8e4a2070359244e47d..be18af52e7dc9a5657af1a9a3acb50f56f66d8d3 100644
--- a/arch/arm/crypto/curve25519-core.S
+++ b/arch/arm/crypto/curve25519-core.S
@@ -1,43 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
 /*
- * Public domain code from Daniel J. Bernstein and Peter Schwabe, from
- * SUPERCOP's curve25519/neon2/scalarmult.s.
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
+ * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
+ * manually reworked for use in kernel space.
  */
 
-.fpu neon
+#include <linux/linkage.h>
+
 .text
+.fpu neon
+.arch armv7-a
 .align 4
-.global _crypto_scalarmult_curve25519_neon2
-.global crypto_scalarmult_curve25519_neon2
-.type _crypto_scalarmult_curve25519_neon2 STT_FUNC
-.type crypto_scalarmult_curve25519_neon2 STT_FUNC
-       _crypto_scalarmult_curve25519_neon2:
-       crypto_scalarmult_curve25519_neon2:
-       vpush           {q4, q5, q6, q7}
-       mov             r12, sp
-       sub             sp, sp, #736
-       and             sp, sp, #0xffffffe0
-       strd            r4, [sp, #0]
-       strd            r6, [sp, #8]
-       strd            r8, [sp, #16]
-       strd            r10, [sp, #24]
-       str             r12, [sp, #480]
-       str             r14, [sp, #484]
-       mov             r0, r0
-       mov             r1, r1
-       mov             r2, r2
-       add             r3, sp, #32
-       ldr             r4, =0
-       ldr             r5, =254
+
+ENTRY(curve25519_neon)
+       push            {r4-r11, lr}
+       mov             ip, sp
+       sub             r3, sp, #704
+       and             r3, r3, #0xfffffff0
+       mov             sp, r3
+       movw            r4, #0
+       movw            r5, #254
        vmov.i32        q0, #1
        vshr.u64        q1, q0, #7
        vshr.u64        q0, q0, #8
        vmov.i32        d4, #19
        vmov.i32        d5, #38
-       add             r6, sp, #512
-       vst1.8          {d2-d3}, [r6, : 128]
-       add             r6, sp, #528
-       vst1.8          {d0-d1}, [r6, : 128]
-       add             r6, sp, #544
+       add             r6, sp, #480
+       vst1.8          {d2-d3}, [r6, : 128]!
+       vst1.8          {d0-d1}, [r6, : 128]!
        vst1.8          {d4-d5}, [r6, : 128]
        add             r6, r3, #0
        vmov.i32        q2, #0
        vst1.8          {d4-d5}, [r6, : 128]!
        vst1.8          d4, [r6, : 64]
        add             r6, r3, #0
-       ldr             r7, =960
+       movw            r7, #960
        sub             r7, r7, #2
        neg             r7, r7
        sub             r7, r7, r7, LSL #7
        str             r7, [r6]
-       add             r6, sp, #704
+       add             r6, sp, #672
        vld1.8          {d4-d5}, [r1]!
        vld1.8          {d6-d7}, [r1]
        vst1.8          {d4-d5}, [r6, : 128]!
        vst1.8          {d0-d1}, [r6, : 128]!
        vst1.8          {d2-d3}, [r6, : 128]!
        vst1.8          d4, [r6, : 64]
-._mainloop:
+.Lmainloop:
        mov             r2, r5, LSR #3
        and             r6, r5, #7
        ldrb            r2, [r1, r2]
        mov             r2, r2, LSR r6
        and             r2, r2, #1
-       str             r5, [sp, #488]
+       str             r5, [sp, #456]
        eor             r4, r4, r2
-       str             r2, [sp, #492]
+       str             r2, [sp, #460]
        neg             r2, r4
        add             r4, r3, #96
        add             r5, r3, #192
        vsub.i32        q0, q1, q3
        vst1.8          d4, [r4, : 64]
        vst1.8          d0, [r6, : 64]
-       add             r2, sp, #544
+       add             r2, sp, #512
        add             r4, r3, #96
        add             r5, r3, #144
        vld1.8          {d0-d1}, [r2, : 128]
        vmlal.s32       q0, d12, d8
        vmlal.s32       q0, d13, d17
        vmlal.s32       q0, d6, d6
-       add             r2, sp, #512
-       vld1.8          {d18-d19}, [r2, : 128]
+       add             r2, sp, #480
+       vld1.8          {d18-d19}, [r2, : 128]!
        vmull.s32       q3, d16, d7
        vmlal.s32       q3, d10, d15
        vmlal.s32       q3, d11, d14
        vmlal.s32       q3, d12, d9
        vmlal.s32       q3, d13, d8
-       add             r2, sp, #528
        vld1.8          {d8-d9}, [r2, : 128]
        vadd.i64        q5, q12, q9
        vadd.i64        q6, q15, q9
        vadd.i32        q5, q5, q0
        vtrn.32         q11, q14
        vadd.i32        q6, q6, q3
-       add             r2, sp, #560
+       add             r2, sp, #528
        vadd.i32        q10, q10, q2
        vtrn.32         d24, d25
-       vst1.8          {d12-d13}, [r2, : 128]
+       vst1.8          {d12-d13}, [r2, : 128]!
        vshl.i32        q6, q13, #1
-       add             r2, sp, #576
-       vst1.8          {d20-d21}, [r2, : 128]
+       vst1.8          {d20-d21}, [r2, : 128]!
        vshl.i32        q10, q14, #1
-       add             r2, sp, #592
-       vst1.8          {d12-d13}, [r2, : 128]
+       vst1.8          {d12-d13}, [r2, : 128]!
        vshl.i32        q15, q12, #1
        vadd.i32        q8, q8, q4
        vext.32         d10, d31, d30, #0
        vadd.i32        q7, q7, q1
-       add             r2, sp, #608
-       vst1.8          {d16-d17}, [r2, : 128]
+       vst1.8          {d16-d17}, [r2, : 128]!
        vmull.s32       q8, d18, d5
        vmlal.s32       q8, d26, d4
        vmlal.s32       q8, d19, d9
        vmlal.s32       q8, d29, d1
        vmlal.s32       q8, d24, d6
        vmlal.s32       q8, d25, d0
-       add             r2, sp, #624
-       vst1.8          {d14-d15}, [r2, : 128]
+       vst1.8          {d14-d15}, [r2, : 128]!
        vmull.s32       q2, d18, d4
        vmlal.s32       q2, d12, d9
        vmlal.s32       q2, d13, d8
        vmlal.s32       q2, d22, d2
        vmlal.s32       q2, d23, d1
        vmlal.s32       q2, d24, d0
-       add             r2, sp, #640
-       vst1.8          {d20-d21}, [r2, : 128]
+       vst1.8          {d20-d21}, [r2, : 128]!
        vmull.s32       q7, d18, d9
        vmlal.s32       q7, d26, d3
        vmlal.s32       q7, d19, d8
        vmlal.s32       q7, d28, d1
        vmlal.s32       q7, d23, d6
        vmlal.s32       q7, d29, d0
-       add             r2, sp, #656
-       vst1.8          {d10-d11}, [r2, : 128]
+       vst1.8          {d10-d11}, [r2, : 128]!
        vmull.s32       q5, d18, d3
        vmlal.s32       q5, d19, d2
        vmlal.s32       q5, d22, d1
        vmlal.s32       q5, d23, d0
        vmlal.s32       q5, d12, d8
-       add             r2, sp, #672
        vst1.8          {d16-d17}, [r2, : 128]
        vmull.s32       q4, d18, d8
        vmlal.s32       q4, d26, d2
        vmlal.s32       q8, d26, d1
        vmlal.s32       q8, d19, d6
        vmlal.s32       q8, d27, d0
-       add             r2, sp, #576
+       add             r2, sp, #544
        vld1.8          {d20-d21}, [r2, : 128]
        vmlal.s32       q7, d24, d21
        vmlal.s32       q7, d25, d20
        vmlal.s32       q8, d22, d21
        vmlal.s32       q8, d28, d20
        vmlal.s32       q5, d24, d20
-       add             r2, sp, #576
        vst1.8          {d14-d15}, [r2, : 128]
        vmull.s32       q7, d18, d6
        vmlal.s32       q7, d26, d0
-       add             r2, sp, #656
+       add             r2, sp, #624
        vld1.8          {d30-d31}, [r2, : 128]
        vmlal.s32       q2, d30, d21
        vmlal.s32       q7, d19, d21
        vmlal.s32       q7, d27, d20
-       add             r2, sp, #624
+       add             r2, sp, #592
        vld1.8          {d26-d27}, [r2, : 128]
        vmlal.s32       q4, d25, d27
        vmlal.s32       q8, d29, d27
        vmlal.s32       q8, d25, d26
        vmlal.s32       q7, d28, d27
        vmlal.s32       q7, d29, d26
-       add             r2, sp, #608
+       add             r2, sp, #576
        vld1.8          {d28-d29}, [r2, : 128]
        vmlal.s32       q4, d24, d29
        vmlal.s32       q8, d23, d29
        vmlal.s32       q8, d24, d28
        vmlal.s32       q7, d22, d29
        vmlal.s32       q7, d23, d28
-       add             r2, sp, #608
        vst1.8          {d8-d9}, [r2, : 128]
-       add             r2, sp, #560
+       add             r2, sp, #528
        vld1.8          {d8-d9}, [r2, : 128]
        vmlal.s32       q7, d24, d9
        vmlal.s32       q7, d25, d31
        vmlal.s32       q0, d23, d26
        vmlal.s32       q0, d24, d31
        vmlal.s32       q0, d19, d20
-       add             r2, sp, #640
+       add             r2, sp, #608
        vld1.8          {d18-d19}, [r2, : 128]
        vmlal.s32       q2, d18, d7
-       vmlal.s32       q2, d19, d6
        vmlal.s32       q5, d18, d6
-       vmlal.s32       q5, d19, d21
        vmlal.s32       q1, d18, d21
-       vmlal.s32       q1, d19, d29
        vmlal.s32       q0, d18, d28
-       vmlal.s32       q0, d19, d9
        vmlal.s32       q6, d18, d29
+       vmlal.s32       q2, d19, d6
+       vmlal.s32       q5, d19, d21
+       vmlal.s32       q1, d19, d29
+       vmlal.s32       q0, d19, d9
        vmlal.s32       q6, d19, d28
-       add             r2, sp, #592
+       add             r2, sp, #560
        vld1.8          {d18-d19}, [r2, : 128]
-       add             r2, sp, #512
+       add             r2, sp, #480
        vld1.8          {d22-d23}, [r2, : 128]
        vmlal.s32       q5, d19, d7
        vmlal.s32       q0, d18, d21
        vmlal.s32       q0, d19, d29
        vmlal.s32       q6, d18, d6
-       add             r2, sp, #528
+       add             r2, sp, #496
        vld1.8          {d6-d7}, [r2, : 128]
        vmlal.s32       q6, d19, d21
-       add             r2, sp, #576
+       add             r2, sp, #544
        vld1.8          {d18-d19}, [r2, : 128]
        vmlal.s32       q0, d30, d8
-       add             r2, sp, #672
+       add             r2, sp, #640
        vld1.8          {d20-d21}, [r2, : 128]
        vmlal.s32       q5, d30, d29
-       add             r2, sp, #608
+       add             r2, sp, #576
        vld1.8          {d24-d25}, [r2, : 128]
        vmlal.s32       q1, d30, d28
        vadd.i64        q13, q0, q11
        vadd.i32        q5, q5, q0
        vtrn.32         q11, q14
        vadd.i32        q6, q6, q3
-       add             r2, sp, #560
+       add             r2, sp, #528
        vadd.i32        q10, q10, q2
        vtrn.32         d24, d25
-       vst1.8          {d12-d13}, [r2, : 128]
+       vst1.8          {d12-d13}, [r2, : 128]!
        vshl.i32        q6, q13, #1
-       add             r2, sp, #576
-       vst1.8          {d20-d21}, [r2, : 128]
+       vst1.8          {d20-d21}, [r2, : 128]!
        vshl.i32        q10, q14, #1
-       add             r2, sp, #592
-       vst1.8          {d12-d13}, [r2, : 128]
+       vst1.8          {d12-d13}, [r2, : 128]!
        vshl.i32        q15, q12, #1
        vadd.i32        q8, q8, q4
        vext.32         d10, d31, d30, #0
        vadd.i32        q7, q7, q1
-       add             r2, sp, #608
-       vst1.8          {d16-d17}, [r2, : 128]
+       vst1.8          {d16-d17}, [r2, : 128]!
        vmull.s32       q8, d18, d5
        vmlal.s32       q8, d26, d4
        vmlal.s32       q8, d19, d9
        vmlal.s32       q8, d29, d1
        vmlal.s32       q8, d24, d6
        vmlal.s32       q8, d25, d0
-       add             r2, sp, #624
-       vst1.8          {d14-d15}, [r2, : 128]
+       vst1.8          {d14-d15}, [r2, : 128]!
        vmull.s32       q2, d18, d4
        vmlal.s32       q2, d12, d9
        vmlal.s32       q2, d13, d8
        vmlal.s32       q2, d22, d2
        vmlal.s32       q2, d23, d1
        vmlal.s32       q2, d24, d0
-       add             r2, sp, #640
-       vst1.8          {d20-d21}, [r2, : 128]
+       vst1.8          {d20-d21}, [r2, : 128]!
        vmull.s32       q7, d18, d9
        vmlal.s32       q7, d26, d3
        vmlal.s32       q7, d19, d8
        vmlal.s32       q7, d28, d1
        vmlal.s32       q7, d23, d6
        vmlal.s32       q7, d29, d0
-       add             r2, sp, #656
-       vst1.8          {d10-d11}, [r2, : 128]
+       vst1.8          {d10-d11}, [r2, : 128]!
        vmull.s32       q5, d18, d3
        vmlal.s32       q5, d19, d2
        vmlal.s32       q5, d22, d1
        vmlal.s32       q5, d23, d0
        vmlal.s32       q5, d12, d8
-       add             r2, sp, #672
-       vst1.8          {d16-d17}, [r2, : 128]
+       vst1.8          {d16-d17}, [r2, : 128]!
        vmull.s32       q4, d18, d8
        vmlal.s32       q4, d26, d2
        vmlal.s32       q4, d19, d7
        vmlal.s32       q8, d26, d1
        vmlal.s32       q8, d19, d6
        vmlal.s32       q8, d27, d0
-       add             r2, sp, #576
+       add             r2, sp, #544
        vld1.8          {d20-d21}, [r2, : 128]
        vmlal.s32       q7, d24, d21
        vmlal.s32       q7, d25, d20
        vmlal.s32       q8, d22, d21
        vmlal.s32       q8, d28, d20
        vmlal.s32       q5, d24, d20
-       add             r2, sp, #576
        vst1.8          {d14-d15}, [r2, : 128]
        vmull.s32       q7, d18, d6
        vmlal.s32       q7, d26, d0
-       add             r2, sp, #656
+       add             r2, sp, #624
        vld1.8          {d30-d31}, [r2, : 128]
        vmlal.s32       q2, d30, d21
        vmlal.s32       q7, d19, d21
        vmlal.s32       q7, d27, d20
-       add             r2, sp, #624
+       add             r2, sp, #592
        vld1.8          {d26-d27}, [r2, : 128]
        vmlal.s32       q4, d25, d27
        vmlal.s32       q8, d29, d27
        vmlal.s32       q8, d25, d26
        vmlal.s32       q7, d28, d27
        vmlal.s32       q7, d29, d26
-       add             r2, sp, #608
+       add             r2, sp, #576
        vld1.8          {d28-d29}, [r2, : 128]
        vmlal.s32       q4, d24, d29
        vmlal.s32       q8, d23, d29
        vmlal.s32       q8, d24, d28
        vmlal.s32       q7, d22, d29
        vmlal.s32       q7, d23, d28
-       add             r2, sp, #608
        vst1.8          {d8-d9}, [r2, : 128]
-       add             r2, sp, #560
+       add             r2, sp, #528
        vld1.8          {d8-d9}, [r2, : 128]
        vmlal.s32       q7, d24, d9
        vmlal.s32       q7, d25, d31
        vmlal.s32       q0, d23, d26
        vmlal.s32       q0, d24, d31
        vmlal.s32       q0, d19, d20
-       add             r2, sp, #640
+       add             r2, sp, #608
        vld1.8          {d18-d19}, [r2, : 128]
        vmlal.s32       q2, d18, d7
-       vmlal.s32       q2, d19, d6
        vmlal.s32       q5, d18, d6
-       vmlal.s32       q5, d19, d21
        vmlal.s32       q1, d18, d21
-       vmlal.s32       q1, d19, d29
        vmlal.s32       q0, d18, d28
-       vmlal.s32       q0, d19, d9
        vmlal.s32       q6, d18, d29
+       vmlal.s32       q2, d19, d6
+       vmlal.s32       q5, d19, d21
+       vmlal.s32       q1, d19, d29
+       vmlal.s32       q0, d19, d9
        vmlal.s32       q6, d19, d28
-       add             r2, sp, #592
+       add             r2, sp, #560
        vld1.8          {d18-d19}, [r2, : 128]
-       add             r2, sp, #512
+       add             r2, sp, #480
        vld1.8          {d22-d23}, [r2, : 128]
        vmlal.s32       q5, d19, d7
        vmlal.s32       q0, d18, d21
        vmlal.s32       q0, d19, d29
        vmlal.s32       q6, d18, d6
-       add             r2, sp, #528
+       add             r2, sp, #496
        vld1.8          {d6-d7}, [r2, : 128]
        vmlal.s32       q6, d19, d21
-       add             r2, sp, #576
+       add             r2, sp, #544
        vld1.8          {d18-d19}, [r2, : 128]
        vmlal.s32       q0, d30, d8
-       add             r2, sp, #672
+       add             r2, sp, #640
        vld1.8          {d20-d21}, [r2, : 128]
        vmlal.s32       q5, d30, d29
-       add             r2, sp, #608
+       add             r2, sp, #576
        vld1.8          {d24-d25}, [r2, : 128]
        vmlal.s32       q1, d30, d28
        vadd.i64        q13, q0, q11
        sub             r4, r4, #24
        vst1.8          d0, [r2, : 64]
        vst1.8          d1, [r4, : 64]
-       add             r2, sp, #544
+       add             r2, sp, #512
        add             r4, r3, #144
        add             r5, r3, #192
        vld1.8          {d0-d1}, [r2, : 128]
        vmlal.s32       q0, d12, d8
        vmlal.s32       q0, d13, d17
        vmlal.s32       q0, d6, d6
-       add             r2, sp, #512
-       vld1.8          {d18-d19}, [r2, : 128]
+       add             r2, sp, #480
+       vld1.8          {d18-d19}, [r2, : 128]!
        vmull.s32       q3, d16, d7
        vmlal.s32       q3, d10, d15
        vmlal.s32       q3, d11, d14
        vmlal.s32       q3, d12, d9
        vmlal.s32       q3, d13, d8
-       add             r2, sp, #528
        vld1.8          {d8-d9}, [r2, : 128]
        vadd.i64        q5, q12, q9
        vadd.i64        q6, q15, q9
        vadd.i32        q5, q5, q0
        vtrn.32         q11, q14
        vadd.i32        q6, q6, q3
-       add             r2, sp, #560
+       add             r2, sp, #528
        vadd.i32        q10, q10, q2
        vtrn.32         d24, d25
-       vst1.8          {d12-d13}, [r2, : 128]
+       vst1.8          {d12-d13}, [r2, : 128]!
        vshl.i32        q6, q13, #1
-       add             r2, sp, #576
-       vst1.8          {d20-d21}, [r2, : 128]
+       vst1.8          {d20-d21}, [r2, : 128]!
        vshl.i32        q10, q14, #1
-       add             r2, sp, #592
-       vst1.8          {d12-d13}, [r2, : 128]
+       vst1.8          {d12-d13}, [r2, : 128]!
        vshl.i32        q15, q12, #1
        vadd.i32        q8, q8, q4
        vext.32         d10, d31, d30, #0
        vadd.i32        q7, q7, q1
-       add             r2, sp, #608
-       vst1.8          {d16-d17}, [r2, : 128]
+       vst1.8          {d16-d17}, [r2, : 128]!
        vmull.s32       q8, d18, d5
        vmlal.s32       q8, d26, d4
        vmlal.s32       q8, d19, d9
        vmlal.s32       q8, d29, d1
        vmlal.s32       q8, d24, d6
        vmlal.s32       q8, d25, d0
-       add             r2, sp, #624
-       vst1.8          {d14-d15}, [r2, : 128]
+       vst1.8          {d14-d15}, [r2, : 128]!
        vmull.s32       q2, d18, d4
        vmlal.s32       q2, d12, d9
        vmlal.s32       q2, d13, d8
        vmlal.s32       q2, d22, d2
        vmlal.s32       q2, d23, d1
        vmlal.s32       q2, d24, d0
-       add             r2, sp, #640
-       vst1.8          {d20-d21}, [r2, : 128]
+       vst1.8          {d20-d21}, [r2, : 128]!
        vmull.s32       q7, d18, d9
        vmlal.s32       q7, d26, d3
        vmlal.s32       q7, d19, d8
        vmlal.s32       q7, d28, d1
        vmlal.s32       q7, d23, d6
        vmlal.s32       q7, d29, d0
-       add             r2, sp, #656
-       vst1.8          {d10-d11}, [r2, : 128]
+       vst1.8          {d10-d11}, [r2, : 128]!
        vmull.s32       q5, d18, d3
        vmlal.s32       q5, d19, d2
        vmlal.s32       q5, d22, d1
        vmlal.s32       q5, d23, d0
        vmlal.s32       q5, d12, d8
-       add             r2, sp, #672
-       vst1.8          {d16-d17}, [r2, : 128]
+       vst1.8          {d16-d17}, [r2, : 128]!
        vmull.s32       q4, d18, d8
        vmlal.s32       q4, d26, d2
        vmlal.s32       q4, d19, d7
        vmlal.s32       q8, d26, d1
        vmlal.s32       q8, d19, d6
        vmlal.s32       q8, d27, d0
-       add             r2, sp, #576
+       add             r2, sp, #544
        vld1.8          {d20-d21}, [r2, : 128]
        vmlal.s32       q7, d24, d21
        vmlal.s32       q7, d25, d20
        vmlal.s32       q8, d22, d21
        vmlal.s32       q8, d28, d20
        vmlal.s32       q5, d24, d20
-       add             r2, sp, #576
        vst1.8          {d14-d15}, [r2, : 128]
        vmull.s32       q7, d18, d6
        vmlal.s32       q7, d26, d0
-       add             r2, sp, #656
+       add             r2, sp, #624
        vld1.8          {d30-d31}, [r2, : 128]
        vmlal.s32       q2, d30, d21
        vmlal.s32       q7, d19, d21
        vmlal.s32       q7, d27, d20
-       add             r2, sp, #624
+       add             r2, sp, #592
        vld1.8          {d26-d27}, [r2, : 128]
        vmlal.s32       q4, d25, d27
        vmlal.s32       q8, d29, d27
        vmlal.s32       q8, d25, d26
        vmlal.s32       q7, d28, d27
        vmlal.s32       q7, d29, d26
-       add             r2, sp, #608
+       add             r2, sp, #576
        vld1.8          {d28-d29}, [r2, : 128]
        vmlal.s32       q4, d24, d29
        vmlal.s32       q8, d23, d29
        vmlal.s32       q8, d24, d28
        vmlal.s32       q7, d22, d29
        vmlal.s32       q7, d23, d28
-       add             r2, sp, #608
        vst1.8          {d8-d9}, [r2, : 128]
-       add             r2, sp, #560
+       add             r2, sp, #528
        vld1.8          {d8-d9}, [r2, : 128]
        vmlal.s32       q7, d24, d9
        vmlal.s32       q7, d25, d31
        vmlal.s32       q0, d23, d26
        vmlal.s32       q0, d24, d31
        vmlal.s32       q0, d19, d20
-       add             r2, sp, #640
+       add             r2, sp, #608
        vld1.8          {d18-d19}, [r2, : 128]
        vmlal.s32       q2, d18, d7
-       vmlal.s32       q2, d19, d6
        vmlal.s32       q5, d18, d6
-       vmlal.s32       q5, d19, d21
        vmlal.s32       q1, d18, d21
-       vmlal.s32       q1, d19, d29
        vmlal.s32       q0, d18, d28
-       vmlal.s32       q0, d19, d9
        vmlal.s32       q6, d18, d29
+       vmlal.s32       q2, d19, d6
+       vmlal.s32       q5, d19, d21
+       vmlal.s32       q1, d19, d29
+       vmlal.s32       q0, d19, d9
        vmlal.s32       q6, d19, d28
-       add             r2, sp, #592
+       add             r2, sp, #560
        vld1.8          {d18-d19}, [r2, : 128]
-       add             r2, sp, #512
+       add             r2, sp, #480
        vld1.8          {d22-d23}, [r2, : 128]
        vmlal.s32       q5, d19, d7
        vmlal.s32       q0, d18, d21
        vmlal.s32       q0, d19, d29
        vmlal.s32       q6, d18, d6
-       add             r2, sp, #528
+       add             r2, sp, #496
        vld1.8          {d6-d7}, [r2, : 128]
        vmlal.s32       q6, d19, d21
-       add             r2, sp, #576
+       add             r2, sp, #544
        vld1.8          {d18-d19}, [r2, : 128]
        vmlal.s32       q0, d30, d8
-       add             r2, sp, #672
+       add             r2, sp, #640
        vld1.8          {d20-d21}, [r2, : 128]
        vmlal.s32       q5, d30, d29
-       add             r2, sp, #608
+       add             r2, sp, #576
        vld1.8          {d24-d25}, [r2, : 128]
        vmlal.s32       q1, d30, d28
        vadd.i64        q13, q0, q11
        sub             r4, r4, #24
        vst1.8          d0, [r2, : 64]
        vst1.8          d1, [r4, : 64]
-       ldr             r2, [sp, #488]
-       ldr             r4, [sp, #492]
+       ldr             r2, [sp, #456]
+       ldr             r4, [sp, #460]
        subs            r5, r2, #1
-       bge             ._mainloop
+       bge             .Lmainloop
        add             r1, r3, #144
        add             r2, r3, #336
        vld1.8          {d0-d1}, [r1, : 128]!
        vst1.8          {d0-d1}, [r2, : 128]!
        vst1.8          {d2-d3}, [r2, : 128]!
        vst1.8          d4, [r2, : 64]
-       ldr             r1, =0
-._invertloop:
+       movw            r1, #0
+.Linvertloop:
        add             r2, r3, #144
-       ldr             r4, =0
-       ldr             r5, =2
+       movw            r4, #0
+       movw            r5, #2
        cmp             r1, #1
-       ldreq           r5, =1
+       moveq           r5, #1
        addeq           r2, r3, #336
        addeq           r4, r3, #48
        cmp             r1, #2
-       ldreq           r5, =1
+       moveq           r5, #1
        addeq           r2, r3, #48
        cmp             r1, #3
-       ldreq           r5, =5
+       moveq           r5, #5
        addeq           r4, r3, #336
        cmp             r1, #4
-       ldreq           r5, =10
+       moveq           r5, #10
        cmp             r1, #5
-       ldreq           r5, =20
+       moveq           r5, #20
        cmp             r1, #6
-       ldreq           r5, =10
+       moveq           r5, #10
        addeq           r2, r3, #336
        addeq           r4, r3, #336
        cmp             r1, #7
-       ldreq           r5, =50
+       moveq           r5, #50
        cmp             r1, #8
-       ldreq           r5, =100
+       moveq           r5, #100
        cmp             r1, #9
-       ldreq           r5, =50
+       moveq           r5, #50
        addeq           r2, r3, #336
        cmp             r1, #10
-       ldreq           r5, =5
+       moveq           r5, #5
        addeq           r2, r3, #48
        cmp             r1, #11
-       ldreq           r5, =0
+       moveq           r5, #0
        addeq           r2, r3, #96
        add             r6, r3, #144
        add             r7, r3, #288
        vst1.8          {d2-d3}, [r7, : 128]!
        vst1.8          d4, [r7, : 64]
        cmp             r5, #0
-       beq             ._skipsquaringloop
-._squaringloop:
+       beq             .Lskipsquaringloop
+.Lsquaringloop:
        add             r6, r3, #288
        add             r7, r3, #288
        add             r8, r3, #288
        vld1.8          {d6-d7}, [r7, : 128]!
        vld1.8          {d9}, [r7, : 64]
        vld1.8          {d10-d11}, [r6, : 128]!
-       add             r7, sp, #416
+       add             r7, sp, #384
        vld1.8          {d12-d13}, [r6, : 128]!
        vmul.i32        q7, q2, q0
        vld1.8          {d8}, [r6, : 64]
        vext.32         d10, d6, d6, #0
        vmov.i32        q1, #0xffffffff
        vshl.i64        q4, q1, #25
-       add             r7, sp, #512
+       add             r7, sp, #480
        vld1.8          {d14-d15}, [r7, : 128]
        vadd.i64        q9, q2, q7
        vshl.i64        q1, q1, #26
        vadd.i64        q5, q5, q10
        vand            q9, q9, q1
        vld1.8          {d16}, [r6, : 64]!
-       add             r6, sp, #528
+       add             r6, sp, #496
        vld1.8          {d20-d21}, [r6, : 128]
        vadd.i64        q11, q5, q10
        vsub.i64        q2, q2, q9
        sub             r6, r6, #32
        vst1.8          d4, [r6, : 64]
        subs            r5, r5, #1
-       bhi             ._squaringloop
-._skipsquaringloop:
+       bhi             .Lsquaringloop
+.Lskipsquaringloop:
        mov             r2, r2
        add             r5, r3, #288
        add             r6, r3, #144
        vld1.8          {d6-d7}, [r5, : 128]!
        vld1.8          {d9}, [r5, : 64]
        vld1.8          {d10-d11}, [r2, : 128]!
-       add             r5, sp, #416
+       add             r5, sp, #384
        vld1.8          {d12-d13}, [r2, : 128]!
        vmul.i32        q7, q2, q0
        vld1.8          {d8}, [r2, : 64]
        vext.32         d10, d6, d6, #0
        vmov.i32        q1, #0xffffffff
        vshl.i64        q4, q1, #25
-       add             r5, sp, #512
+       add             r5, sp, #480
        vld1.8          {d14-d15}, [r5, : 128]
        vadd.i64        q9, q2, q7
        vshl.i64        q1, q1, #26
        vadd.i64        q5, q5, q10
        vand            q9, q9, q1
        vld1.8          {d16}, [r2, : 64]!
-       add             r2, sp, #528
+       add             r2, sp, #496
        vld1.8          {d20-d21}, [r2, : 128]
        vadd.i64        q11, q5, q10
        vsub.i64        q2, q2, q9
        sub             r2, r2, #32
        vst1.8          d4, [r2, : 64]
        cmp             r4, #0
-       beq             ._skippostcopy
+       beq             .Lskippostcopy
        add             r2, r3, #144
        mov             r4, r4
        vld1.8          {d0-d1}, [r2, : 128]!
        vst1.8          {d0-d1}, [r4, : 128]!
        vst1.8          {d2-d3}, [r4, : 128]!
        vst1.8          d4, [r4, : 64]
-._skippostcopy:
+.Lskippostcopy:
        cmp             r1, #1
-       bne             ._skipfinalcopy
+       bne             .Lskipfinalcopy
        add             r2, r3, #288
        add             r4, r3, #144
        vld1.8          {d0-d1}, [r2, : 128]!
        vst1.8          {d0-d1}, [r4, : 128]!
        vst1.8          {d2-d3}, [r4, : 128]!
        vst1.8          d4, [r4, : 64]
-._skipfinalcopy:
+.Lskipfinalcopy:
        add             r1, r1, #1
        cmp             r1, #12
-       blo             ._invertloop
+       blo             .Linvertloop
        add             r1, r3, #144
        ldr             r2, [r1], #4
        ldr             r3, [r1], #4
        add             r8, r8, r10, LSL #12
        mov             r9, r10, LSR #20
        add             r1, r9, r1, LSL #6
-       str             r2, [r0], #4
-       str             r3, [r0], #4
-       str             r4, [r0], #4
-       str             r5, [r0], #4
-       str             r6, [r0], #4
-       str             r7, [r0], #4
-       str             r8, [r0], #4
-       str             r1, [r0]
-       ldrd            r4, [sp, #0]
-       ldrd            r6, [sp, #8]
-       ldrd            r8, [sp, #16]
-       ldrd            r10, [sp, #24]
-       ldr             r12, [sp, #480]
-       ldr             r14, [sp, #484]
-       ldr             r0, =0
-       mov             sp, r12
-       vpop            {q4, q5, q6, q7}
-       bx              lr
+       str             r2, [r0]
+       str             r3, [r0, #4]
+       str             r4, [r0, #8]
+       str             r5, [r0, #12]
+       str             r6, [r0, #16]
+       str             r7, [r0, #20]
+       str             r8, [r0, #24]
+       str             r1, [r0, #28]
+       movw            r0, #0
+       mov             sp, ip
+       pop             {r4-r11, pc}
+ENDPROC(curve25519_neon)
diff --git a/arch/arm/crypto/curve25519-glue.c b/arch/arm/crypto/curve25519-glue.c
new file mode 100644
index 0000000..2e9e12d
--- /dev/null
+++ b/arch/arm/crypto/curve25519-glue.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
+ * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
+ * manually reworked for use in kernel space.
+ */
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <crypto/internal/kpp.h>
+#include <crypto/internal/simd.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/jump_label.h>
+#include <crypto/curve25519.h>
+
+asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE],
+                               const u8 secret[CURVE25519_KEY_SIZE],
+                               const u8 basepoint[CURVE25519_KEY_SIZE]);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+void curve25519_arch(u8 out[CURVE25519_KEY_SIZE],
+                    const u8 scalar[CURVE25519_KEY_SIZE],
+                    const u8 point[CURVE25519_KEY_SIZE])
+{
+       if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
+               kernel_neon_begin();
+               curve25519_neon(out, scalar, point);
+               kernel_neon_end();
+       } else {
+               curve25519_generic(out, scalar, point);
+       }
+}
+EXPORT_SYMBOL(curve25519_arch);
+
+static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
+                                unsigned int len)
+{
+       u8 *secret = kpp_tfm_ctx(tfm);
+
+       if (!len)
+               curve25519_generate_secret(secret);
+       else if (len == CURVE25519_KEY_SIZE &&
+                crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
+               memcpy(secret, buf, CURVE25519_KEY_SIZE);
+       else
+               return -EINVAL;
+       return 0;
+}
+
+static int curve25519_compute_value(struct kpp_request *req)
+{
+       struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
+       const u8 *secret = kpp_tfm_ctx(tfm);
+       u8 public_key[CURVE25519_KEY_SIZE];
+       u8 buf[CURVE25519_KEY_SIZE];
+       int copied, nbytes;
+       u8 const *bp;
+
+       if (req->src) {
+               copied = sg_copy_to_buffer(req->src,
+                                          sg_nents_for_len(req->src,
+                                                           CURVE25519_KEY_SIZE),
+                                          public_key, CURVE25519_KEY_SIZE);
+               if (copied != CURVE25519_KEY_SIZE)
+                       return -EINVAL;
+               bp = public_key;
+       } else {
+               bp = curve25519_base_point;
+       }
+
+       curve25519_arch(buf, secret, bp);
+
+       /* might want less than we've got */
+       nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
+       copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
+                                                               nbytes),
+                                    buf, nbytes);
+       if (copied != nbytes)
+               return -EINVAL;
+       return 0;
+}
+
+static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
+{
+       return CURVE25519_KEY_SIZE;
+}
+
+static struct kpp_alg curve25519_alg = {
+       .base.cra_name          = "curve25519",
+       .base.cra_driver_name   = "curve25519-neon",
+       .base.cra_priority      = 200,
+       .base.cra_module        = THIS_MODULE,
+       .base.cra_ctxsize       = CURVE25519_KEY_SIZE,
+
+       .set_secret             = curve25519_set_secret,
+       .generate_public_key    = curve25519_compute_value,
+       .compute_shared_secret  = curve25519_compute_value,
+       .max_size               = curve25519_max_size,
+};
+
+static int __init mod_init(void)
+{
+       if (elf_hwcap & HWCAP_NEON) {
+               static_branch_enable(&have_neon);
+               return crypto_register_kpp(&curve25519_alg);
+       }
+       return 0;
+}
+
+static void __exit mod_exit(void)
+{
+       if (elf_hwcap & HWCAP_NEON)
+               crypto_unregister_kpp(&curve25519_alg);
+}
+
+module_init(mod_init);
+module_exit(mod_exit);
+
+MODULE_ALIAS_CRYPTO("curve25519");
+MODULE_ALIAS_CRYPTO("curve25519-neon");
+MODULE_LICENSE("GPL v2");