crypto: arm64 - revert NEON yield for fast AEAD implementations
author Ard Biesheuvel <ard.biesheuvel@linaro.org>
Sun, 29 Jul 2018 14:52:30 +0000 (16:52 +0200)
committer Herbert Xu <herbert@gondor.apana.org.au>
Tue, 7 Aug 2018 09:26:23 +0000 (17:26 +0800)
As it turns out, checking the TIF_NEED_RESCHED flag after each
iteration results in a significant performance regression (~10%)
when running fast algorithms (i.e., ones that use special instructions
and operate in the < 4 cycles per byte range) on in-order cores with
comparatively slow memory accesses such as the Cortex-A53.
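
To make the reverted pattern concrete, here is a rough C-level sketch of
the loop structure; process_block(), need_resched() and yield_neon() are
stand-ins made up for this example, while in the assembly below the actual
check is the if_will_cond_yield_neon / do_cond_yield_neon macro pair:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Stand-in for one 16-byte block of the fused AES-CCM/GCM transform. */
static void process_block(uint8_t mac[16], const uint8_t *in)
{
	for (int i = 0; i < 16; i++)
		mac[i] ^= in[i];
}

/* Stand-ins for the TIF_NEED_RESCHED test and the NEON yield/reacquire. */
static bool need_resched(void) { return false; }
static void yield_neon(void)   { }

static void auth_data(uint8_t mac[16], const uint8_t *in, size_t blocks)
{
	while (blocks--) {
		process_block(mac, in);
		in += 16;
		/*
		 * The per-iteration check removed by this patch: for a
		 * < 4 cycles/byte cipher, the extra test-and-branch is a
		 * measurable fraction of the per-block work on an in-order
		 * core like the Cortex-A53.
		 */
		if (need_resched())
			yield_neon();
	}
}

int main(void)
{
	uint8_t mac[16] = { 0 }, buf[64] = { 0 };

	auth_data(mac, buf, sizeof(buf) / 16);
	return 0;
}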

Given the speed of these ciphers, and the fact that the page-based
nature of the AEAD scatterwalk API guarantees that the core NEON
transform is never invoked with more than a single page's worth of
input, we can estimate the worst-case duration of any resulting
scheduling blackout: on a 1 GHz Cortex-A53 running with 64k pages,
processing a page's worth of input at 4 cycles per byte results in
a delay of ~250 us, which is a reasonable upper bound.
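
For reference, a back-of-the-envelope check of that figure, assuming
exactly 4 cycles per byte and a 64 KiB page (the numbers quoted above):

#include <stdio.h>

int main(void)
{
	const double page_bytes      = 64 * 1024; /* 64k pages          */
	const double cycles_per_byte = 4;         /* assumed worst case */
	const double clock_hz        = 1e9;       /* 1 GHz Cortex-A53   */

	/* 262144 cycles / 1 GHz ~= 262 us, i.e. the ~250 us ballpark */
	printf("worst-case blackout: ~%.0f us\n",
	       page_bytes * cycles_per_byte / clock_hz * 1e6);
	return 0;
}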

So let's remove the yield checks from the fused AES-CCM and AES-GCM
routines entirely.

This reverts commit 7b67ae4d5ce8e2f912377f5fbccb95811a92097f and
partially reverts commit 7c50136a8aba8784f07fb66a950cc61a7f3d2ee3.

Fixes: 7c50136a8aba ("crypto: arm64/aes-ghash - yield NEON after every ...")
Fixes: 7b67ae4d5ce8 ("crypto: arm64/aes-ccm - yield NEON after every ...")
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
arch/arm64/crypto/aes-ce-ccm-core.S
arch/arm64/crypto/ghash-ce-core.S

index 88f5aef..e3a375c 100644
         *                           u32 *macp, u8 const rk[], u32 rounds);
         */
 ENTRY(ce_aes_ccm_auth_data)
-       frame_push      7
-
-       mov     x19, x0
-       mov     x20, x1
-       mov     x21, x2
-       mov     x22, x3
-       mov     x23, x4
-       mov     x24, x5
-
-       ldr     w25, [x22]                      /* leftover from prev round? */
+       ldr     w8, [x3]                        /* leftover from prev round? */
        ld1     {v0.16b}, [x0]                  /* load mac */
-       cbz     w25, 1f
-       sub     w25, w25, #16
+       cbz     w8, 1f
+       sub     w8, w8, #16
        eor     v1.16b, v1.16b, v1.16b
-0:     ldrb    w7, [x20], #1                   /* get 1 byte of input */
-       subs    w21, w21, #1
-       add     w25, w25, #1
+0:     ldrb    w7, [x1], #1                    /* get 1 byte of input */
+       subs    w2, w2, #1
+       add     w8, w8, #1
        ins     v1.b[0], w7
        ext     v1.16b, v1.16b, v1.16b, #1      /* rotate in the input bytes */
        beq     8f                              /* out of input? */
-       cbnz    w25, 0b
+       cbnz    w8, 0b
        eor     v0.16b, v0.16b, v1.16b
-1:     ld1     {v3.4s}, [x23]                  /* load first round key */
-       prfm    pldl1strm, [x20]
-       cmp     w24, #12                        /* which key size? */
-       add     x6, x23, #16
-       sub     w7, w24, #2                     /* modified # of rounds */
+1:     ld1     {v3.4s}, [x4]                   /* load first round key */
+       prfm    pldl1strm, [x1]
+       cmp     w5, #12                         /* which key size? */
+       add     x6, x4, #16
+       sub     w7, w5, #2                      /* modified # of rounds */
        bmi     2f
        bne     5f
        mov     v5.16b, v3.16b
@@ -64,43 +55,33 @@ ENTRY(ce_aes_ccm_auth_data)
        ld1     {v5.4s}, [x6], #16              /* load next round key */
        bpl     3b
        aese    v0.16b, v4.16b
-       subs    w21, w21, #16                   /* last data? */
+       subs    w2, w2, #16                     /* last data? */
        eor     v0.16b, v0.16b, v5.16b          /* final round */
        bmi     6f
-       ld1     {v1.16b}, [x20], #16            /* load next input block */
+       ld1     {v1.16b}, [x1], #16             /* load next input block */
        eor     v0.16b, v0.16b, v1.16b          /* xor with mac */
-       beq     6f
-
-       if_will_cond_yield_neon
-       st1     {v0.16b}, [x19]                 /* store mac */
-       do_cond_yield_neon
-       ld1     {v0.16b}, [x19]                 /* reload mac */
-       endif_yield_neon
-
-       b       1b
-6:     st1     {v0.16b}, [x19]                 /* store mac */
+       bne     1b
+6:     st1     {v0.16b}, [x0]                  /* store mac */
        beq     10f
-       adds    w21, w21, #16
+       adds    w2, w2, #16
        beq     10f
-       mov     w25, w21
-7:     ldrb    w7, [x20], #1
+       mov     w8, w2
+7:     ldrb    w7, [x1], #1
        umov    w6, v0.b[0]
        eor     w6, w6, w7
-       strb    w6, [x19], #1
-       subs    w21, w21, #1
+       strb    w6, [x0], #1
+       subs    w2, w2, #1
        beq     10f
        ext     v0.16b, v0.16b, v0.16b, #1      /* rotate out the mac bytes */
        b       7b
-8:     mov     w7, w25
-       add     w25, w25, #16
+8:     mov     w7, w8
+       add     w8, w8, #16
 9:     ext     v1.16b, v1.16b, v1.16b, #1
        adds    w7, w7, #1
        bne     9b
        eor     v0.16b, v0.16b, v1.16b
-       st1     {v0.16b}, [x19]
-10:    str     w25, [x22]
-
-       frame_pop
+       st1     {v0.16b}, [x0]
+10:    str     w8, [x3]
        ret
 ENDPROC(ce_aes_ccm_auth_data)
 
@@ -145,29 +126,19 @@ ENTRY(ce_aes_ccm_final)
 ENDPROC(ce_aes_ccm_final)
 
        .macro  aes_ccm_do_crypt,enc
-       frame_push      8
-
-       mov     x19, x0
-       mov     x20, x1
-       mov     x21, x2
-       mov     x22, x3
-       mov     x23, x4
-       mov     x24, x5
-       mov     x25, x6
-
-       ldr     x26, [x25, #8]                  /* load lower ctr */
-       ld1     {v0.16b}, [x24]                 /* load mac */
-CPU_LE(        rev     x26, x26                )       /* keep swabbed ctr in reg */
+       ldr     x8, [x6, #8]                    /* load lower ctr */
+       ld1     {v0.16b}, [x5]                  /* load mac */
+CPU_LE(        rev     x8, x8                  )       /* keep swabbed ctr in reg */
 0:     /* outer loop */
-       ld1     {v1.8b}, [x25]                  /* load upper ctr */
-       prfm    pldl1strm, [x20]
-       add     x26, x26, #1
-       rev     x9, x26
-       cmp     w23, #12                        /* which key size? */
-       sub     w7, w23, #2                     /* get modified # of rounds */
+       ld1     {v1.8b}, [x6]                   /* load upper ctr */
+       prfm    pldl1strm, [x1]
+       add     x8, x8, #1
+       rev     x9, x8
+       cmp     w4, #12                         /* which key size? */
+       sub     w7, w4, #2                      /* get modified # of rounds */
        ins     v1.d[1], x9                     /* no carry in lower ctr */
-       ld1     {v3.4s}, [x22]                  /* load first round key */
-       add     x10, x22, #16
+       ld1     {v3.4s}, [x3]                   /* load first round key */
+       add     x10, x3, #16
        bmi     1f
        bne     4f
        mov     v5.16b, v3.16b
@@ -194,9 +165,9 @@ CPU_LE(     rev     x26, x26                )       /* keep swabbed ctr in reg */
        bpl     2b
        aese    v0.16b, v4.16b
        aese    v1.16b, v4.16b
-       subs    w21, w21, #16
-       bmi     7f                              /* partial block? */
-       ld1     {v2.16b}, [x20], #16            /* load next input block */
+       subs    w2, w2, #16
+       bmi     6f                              /* partial block? */
+       ld1     {v2.16b}, [x1], #16             /* load next input block */
        .if     \enc == 1
        eor     v2.16b, v2.16b, v5.16b          /* final round enc+mac */
        eor     v1.16b, v1.16b, v2.16b          /* xor with crypted ctr */
@@ -205,29 +176,18 @@ CPU_LE(   rev     x26, x26                )       /* keep swabbed ctr in reg */
        eor     v1.16b, v2.16b, v5.16b          /* final round enc */
        .endif
        eor     v0.16b, v0.16b, v2.16b          /* xor mac with pt ^ rk[last] */
-       st1     {v1.16b}, [x19], #16            /* write output block */
-       beq     5f
-
-       if_will_cond_yield_neon
-       st1     {v0.16b}, [x24]                 /* store mac */
-       do_cond_yield_neon
-       ld1     {v0.16b}, [x24]                 /* reload mac */
-       endif_yield_neon
-
-       b       0b
-5:
-CPU_LE(        rev     x26, x26                        )
-       st1     {v0.16b}, [x24]                 /* store mac */
-       str     x26, [x25, #8]                  /* store lsb end of ctr (BE) */
-
-6:     frame_pop
-       ret
-
-7:     eor     v0.16b, v0.16b, v5.16b          /* final round mac */
+       st1     {v1.16b}, [x0], #16             /* write output block */
+       bne     0b
+CPU_LE(        rev     x8, x8                  )
+       st1     {v0.16b}, [x5]                  /* store mac */
+       str     x8, [x6, #8]                    /* store lsb end of ctr (BE) */
+5:     ret
+
+6:     eor     v0.16b, v0.16b, v5.16b          /* final round mac */
        eor     v1.16b, v1.16b, v5.16b          /* final round enc */
-       st1     {v0.16b}, [x24]                 /* store mac */
-       add     w21, w21, #16                   /* process partial tail block */
-8:     ldrb    w9, [x20], #1                   /* get 1 byte of input */
+       st1     {v0.16b}, [x5]                  /* store mac */
+       add     w2, w2, #16                     /* process partial tail block */
+7:     ldrb    w9, [x1], #1                    /* get 1 byte of input */
        umov    w6, v1.b[0]                     /* get top crypted ctr byte */
        umov    w7, v0.b[0]                     /* get top mac byte */
        .if     \enc == 1
@@ -237,13 +197,13 @@ CPU_LE(   rev     x26, x26                        )
        eor     w9, w9, w6
        eor     w7, w7, w9
        .endif
-       strb    w9, [x19], #1                   /* store out byte */
-       strb    w7, [x24], #1                   /* store mac byte */
-       subs    w21, w21, #1
-       beq     6b
+       strb    w9, [x0], #1                    /* store out byte */
+       strb    w7, [x5], #1                    /* store mac byte */
+       subs    w2, w2, #1
+       beq     5b
        ext     v0.16b, v0.16b, v0.16b, #1      /* shift out mac byte */
        ext     v1.16b, v1.16b, v1.16b, #1      /* shift out ctr byte */
-       b       8b
+       b       7b
        .endm
 
        /*
index dcffb9e..c723647 100644
@@ -322,55 +322,41 @@ ENDPROC(pmull_ghash_update_p8)
        .endm
 
        .macro          pmull_gcm_do_crypt, enc
-       frame_push      10
+       ld1             {SHASH.2d}, [x4]
+       ld1             {XL.2d}, [x1]
+       ldr             x8, [x5, #8]                    // load lower counter
 
-       mov             x19, x0
-       mov             x20, x1
-       mov             x21, x2
-       mov             x22, x3
-       mov             x23, x4
-       mov             x24, x5
-       mov             x25, x6
-       mov             x26, x7
-       .if             \enc == 1
-       ldr             x27, [sp, #96]                  // first stacked arg
-       .endif
-
-       ldr             x28, [x24, #8]                  // load lower counter
-CPU_LE(        rev             x28, x28        )
-
-0:     mov             x0, x25
-       load_round_keys w26, x0
-       ld1             {SHASH.2d}, [x23]
-       ld1             {XL.2d}, [x20]
+       load_round_keys w7, x6
 
        movi            MASK.16b, #0xe1
        ext             SHASH2.16b, SHASH.16b, SHASH.16b, #8
+CPU_LE(        rev             x8, x8          )
        shl             MASK.2d, MASK.2d, #57
        eor             SHASH2.16b, SHASH2.16b, SHASH.16b
 
        .if             \enc == 1
-       ld1             {KS.16b}, [x27]
+       ldr             x10, [sp]
+       ld1             {KS.16b}, [x10]
        .endif
 
-1:     ld1             {CTR.8b}, [x24]                 // load upper counter
-       ld1             {INP.16b}, [x22], #16
-       rev             x9, x28
-       add             x28, x28, #1
-       sub             w19, w19, #1
+0:     ld1             {CTR.8b}, [x5]                  // load upper counter
+       ld1             {INP.16b}, [x3], #16
+       rev             x9, x8
+       add             x8, x8, #1
+       sub             w0, w0, #1
        ins             CTR.d[1], x9                    // set lower counter
 
        .if             \enc == 1
        eor             INP.16b, INP.16b, KS.16b        // encrypt input
-       st1             {INP.16b}, [x21], #16
+       st1             {INP.16b}, [x2], #16
        .endif
 
        rev64           T1.16b, INP.16b
 
-       cmp             w26, #12
-       b.ge            4f                              // AES-192/256?
+       cmp             w7, #12
+       b.ge            2f                              // AES-192/256?
 
-2:     enc_round       CTR, v21
+1:     enc_round       CTR, v21
 
        ext             T2.16b, XL.16b, XL.16b, #8
        ext             IN1.16b, T1.16b, T1.16b, #8
@@ -425,39 +411,27 @@ CPU_LE(   rev             x28, x28        )
 
        .if             \enc == 0
        eor             INP.16b, INP.16b, KS.16b
-       st1             {INP.16b}, [x21], #16
+       st1             {INP.16b}, [x2], #16
        .endif
 
-       cbz             w19, 3f
+       cbnz            w0, 0b
 
-       if_will_cond_yield_neon
-       st1             {XL.2d}, [x20]
-       .if             \enc == 1
-       st1             {KS.16b}, [x27]
-       .endif
-       do_cond_yield_neon
-       b               0b
-       endif_yield_neon
+CPU_LE(        rev             x8, x8          )
+       st1             {XL.2d}, [x1]
+       str             x8, [x5, #8]                    // store lower counter
 
-       b               1b
-
-3:     st1             {XL.2d}, [x20]
        .if             \enc == 1
-       st1             {KS.16b}, [x27]
+       st1             {KS.16b}, [x10]
        .endif
 
-CPU_LE(        rev             x28, x28        )
-       str             x28, [x24, #8]                  // store lower counter
-
-       frame_pop
        ret
 
-4:     b.eq            5f                              // AES-192?
+2:     b.eq            3f                              // AES-192?
        enc_round       CTR, v17
        enc_round       CTR, v18
-5:     enc_round       CTR, v19
+3:     enc_round       CTR, v19
        enc_round       CTR, v20
-       b               2b
+       b               1b
        .endm
 
        /*