crypto: arm64/chacha - simplify tail block handling
author     Ard Biesheuvel <ardb@kernel.org>
Fri, 6 Nov 2020 16:39:38 +0000 (17:39 +0100)
committer  Herbert Xu <herbert@gondor.apana.org.au>
Fri, 13 Nov 2020 09:38:55 +0000 (20:38 +1100)
Based on lessons learnt from optimizing the 32-bit version of this driver,
we can simplify the arm64 version considerably, by reordering the final
two stores when the length of the input is not a multiple of 64 bytes.
This removes the need to use permutation instructions to calculate the
elements that are clobbered by the final overlapping store, given that
the store of the penultimate block now follows it, and that block already
carries the correct values for those elements.
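
To illustrate the idea, here is a rough C sketch (not the kernel code:
xor_blk(), xor_tail() and the flat keystream[] buffer are made up for the
example) of how the tail is now handled:

/*
 * Sketch of the reordered tail stores.  keystream[] is assumed to hold
 * ceil(len / 64) consecutive 64-byte ChaCha blocks; requires len > 64
 * and len % 64 != 0.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void xor_blk(uint8_t *dst, const uint8_t *src, const uint8_t *ks)
{
	int i;

	for (i = 0; i < 64; i++)
		dst[i] = src[i] ^ ks[i];
}

static void xor_tail(uint8_t *dst, const uint8_t *src, size_t len,
		     const uint8_t *keystream)
{
	size_t rem = len % 64;			/* size of the partial final block */
	size_t penult = (len / 64 - 1) * 64;	/* start of the last full block */
	uint8_t ks_tail[64] = { 0 };

	/*
	 * Shift the final block's keystream so that its rem bytes end at
	 * offset 64; the leading 64 - rem bytes stay zero and are
	 * don't-care values.  The assembly builds this with a single tbl
	 * per register using the .Lpermute table.
	 */
	memcpy(ks_tail + 64 - rem, keystream + len - rem, rem);

	/* overlapping store first: a full 64-byte write ending at dst + len */
	xor_blk(dst + len - 64, src + len - 64, ks_tail);

	/*
	 * penultimate block afterwards: rewrites the bytes the overlapping
	 * store clobbered with their correct values
	 */
	xor_blk(dst + penult, src + penult, keystream + penult);
}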

While at it, also simplify the overlapping loads, by calculating the
address of the final overlapping load up front, and switching to this
address for every load that would otherwise extend past the end of the
source buffer.
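
The load side follows the same pattern; a sketch in C (again illustrative
only, with made-up names; nblk is the number of 64-byte loads issued, and
len > 64):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Load nblk 64-byte input blocks from a len-byte source buffer.  Any load
 * that would run past the end of the buffer is redirected to the single
 * precomputed address of the last 64 bytes, which is what the csel
 * following each subs does in the assembly.
 */
static void load_blocks(uint8_t blocks[][64], const uint8_t *src,
			size_t len, size_t nblk)
{
	const uint8_t *last = src + len - 64;	/* final overlapping load */
	const uint8_t *p = src;
	size_t i;

	for (i = 0; i < nblk; i++) {
		if (p + 64 > src + len)		/* would read past the end? */
			p = last;
		memcpy(blocks[i], p, 64);	/* always a full-width load */
		p += 64;
	}
}

Loads redirected this way either supply the data for the final overlapping
store or are simply discarded, but they never read beyond the source
buffer.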

There is no impact on performance, but the resulting code is substantially
smaller and easier to follow.

Cc: Eric Biggers <ebiggers@google.com>
Cc: "Jason A . Donenfeld" <Jason@zx2c4.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
arch/arm64/crypto/chacha-neon-core.S

index e90386a..b70ac76 100644
@@ -195,7 +195,6 @@ SYM_FUNC_START(chacha_4block_xor_neon)
        adr_l           x10, .Lpermute
        and             x5, x4, #63
        add             x10, x10, x5
-       add             x11, x10, #64
 
        //
        // This function encrypts four consecutive ChaCha blocks by loading
@@ -645,11 +644,11 @@ CPU_BE(     rev           a15, a15        )
        zip2            v31.4s, v14.4s, v15.4s
          eor           a15, a15, w9
 
-       mov             x3, #64
+       add             x3, x2, x4
+       sub             x3, x3, #128            // start of last block
+
        subs            x5, x4, #128
-       add             x6, x5, x2
-       csel            x3, x3, xzr, ge
-       csel            x2, x2, x6, ge
+       csel            x2, x2, x3, ge
 
        // interleave 64-bit words in state n, n+2
        zip1            v0.2d, v16.2d, v18.2d
@@ -658,13 +657,10 @@ CPU_BE(     rev           a15, a15        )
        zip1            v8.2d, v17.2d, v19.2d
        zip2            v12.2d, v17.2d, v19.2d
          stp           a2, a3, [x1, #-56]
-       ld1             {v16.16b-v19.16b}, [x2], x3
 
        subs            x6, x4, #192
-       ccmp            x3, xzr, #4, lt
-       add             x7, x6, x2
-       csel            x3, x3, xzr, eq
-       csel            x2, x2, x7, eq
+       ld1             {v16.16b-v19.16b}, [x2], #64
+       csel            x2, x2, x3, ge
 
        zip1            v1.2d, v20.2d, v22.2d
        zip2            v5.2d, v20.2d, v22.2d
@@ -672,13 +668,10 @@ CPU_BE(     rev           a15, a15        )
        zip1            v9.2d, v21.2d, v23.2d
        zip2            v13.2d, v21.2d, v23.2d
          stp           a6, a7, [x1, #-40]
-       ld1             {v20.16b-v23.16b}, [x2], x3
 
        subs            x7, x4, #256
-       ccmp            x3, xzr, #4, lt
-       add             x8, x7, x2
-       csel            x3, x3, xzr, eq
-       csel            x2, x2, x8, eq
+       ld1             {v20.16b-v23.16b}, [x2], #64
+       csel            x2, x2, x3, ge
 
        zip1            v2.2d, v24.2d, v26.2d
        zip2            v6.2d, v24.2d, v26.2d
@@ -686,12 +679,10 @@ CPU_BE(     rev           a15, a15        )
        zip1            v10.2d, v25.2d, v27.2d
        zip2            v14.2d, v25.2d, v27.2d
          stp           a10, a11, [x1, #-24]
-       ld1             {v24.16b-v27.16b}, [x2], x3
 
        subs            x8, x4, #320
-       ccmp            x3, xzr, #4, lt
-       add             x9, x8, x2
-       csel            x2, x2, x9, eq
+       ld1             {v24.16b-v27.16b}, [x2], #64
+       csel            x2, x2, x3, ge
 
        zip1            v3.2d, v28.2d, v30.2d
        zip2            v7.2d, v28.2d, v30.2d
@@ -699,151 +690,105 @@ CPU_BE(   rev           a15, a15        )
        zip1            v11.2d, v29.2d, v31.2d
        zip2            v15.2d, v29.2d, v31.2d
          stp           a14, a15, [x1, #-8]
+
+       tbnz            x5, #63, .Lt128
        ld1             {v28.16b-v31.16b}, [x2]
 
        // xor with corresponding input, write to output
-       tbnz            x5, #63, 0f
        eor             v16.16b, v16.16b, v0.16b
        eor             v17.16b, v17.16b, v1.16b
        eor             v18.16b, v18.16b, v2.16b
        eor             v19.16b, v19.16b, v3.16b
-       st1             {v16.16b-v19.16b}, [x1], #64
-       cbz             x5, .Lout
 
-       tbnz            x6, #63, 1f
+       tbnz            x6, #63, .Lt192
+
        eor             v20.16b, v20.16b, v4.16b
        eor             v21.16b, v21.16b, v5.16b
        eor             v22.16b, v22.16b, v6.16b
        eor             v23.16b, v23.16b, v7.16b
-       st1             {v20.16b-v23.16b}, [x1], #64
-       cbz             x6, .Lout
 
-       tbnz            x7, #63, 2f
+       st1             {v16.16b-v19.16b}, [x1], #64
+       tbnz            x7, #63, .Lt256
+
        eor             v24.16b, v24.16b, v8.16b
        eor             v25.16b, v25.16b, v9.16b
        eor             v26.16b, v26.16b, v10.16b
        eor             v27.16b, v27.16b, v11.16b
-       st1             {v24.16b-v27.16b}, [x1], #64
-       cbz             x7, .Lout
 
-       tbnz            x8, #63, 3f
+       st1             {v20.16b-v23.16b}, [x1], #64
+       tbnz            x8, #63, .Lt320
+
        eor             v28.16b, v28.16b, v12.16b
        eor             v29.16b, v29.16b, v13.16b
        eor             v30.16b, v30.16b, v14.16b
        eor             v31.16b, v31.16b, v15.16b
+
+       st1             {v24.16b-v27.16b}, [x1], #64
        st1             {v28.16b-v31.16b}, [x1]
 
 .Lout: frame_pop
        ret
 
-       // fewer than 128 bytes of in/output
-0:     ld1             {v8.16b}, [x10]
-       ld1             {v9.16b}, [x11]
-       movi            v10.16b, #16
-       sub             x2, x1, #64
-       add             x1, x1, x5
-       ld1             {v16.16b-v19.16b}, [x2]
-       tbl             v4.16b, {v0.16b-v3.16b}, v8.16b
-       tbx             v20.16b, {v16.16b-v19.16b}, v9.16b
-       add             v8.16b, v8.16b, v10.16b
-       add             v9.16b, v9.16b, v10.16b
-       tbl             v5.16b, {v0.16b-v3.16b}, v8.16b
-       tbx             v21.16b, {v16.16b-v19.16b}, v9.16b
-       add             v8.16b, v8.16b, v10.16b
-       add             v9.16b, v9.16b, v10.16b
-       tbl             v6.16b, {v0.16b-v3.16b}, v8.16b
-       tbx             v22.16b, {v16.16b-v19.16b}, v9.16b
-       add             v8.16b, v8.16b, v10.16b
-       add             v9.16b, v9.16b, v10.16b
-       tbl             v7.16b, {v0.16b-v3.16b}, v8.16b
-       tbx             v23.16b, {v16.16b-v19.16b}, v9.16b
-
-       eor             v20.16b, v20.16b, v4.16b
-       eor             v21.16b, v21.16b, v5.16b
-       eor             v22.16b, v22.16b, v6.16b
-       eor             v23.16b, v23.16b, v7.16b
-       st1             {v20.16b-v23.16b}, [x1]
-       b               .Lout
-
        // fewer than 192 bytes of in/output
-1:     ld1             {v8.16b}, [x10]
-       ld1             {v9.16b}, [x11]
-       movi            v10.16b, #16
-       add             x1, x1, x6
-       tbl             v0.16b, {v4.16b-v7.16b}, v8.16b
-       tbx             v20.16b, {v16.16b-v19.16b}, v9.16b
-       add             v8.16b, v8.16b, v10.16b
-       add             v9.16b, v9.16b, v10.16b
-       tbl             v1.16b, {v4.16b-v7.16b}, v8.16b
-       tbx             v21.16b, {v16.16b-v19.16b}, v9.16b
-       add             v8.16b, v8.16b, v10.16b
-       add             v9.16b, v9.16b, v10.16b
-       tbl             v2.16b, {v4.16b-v7.16b}, v8.16b
-       tbx             v22.16b, {v16.16b-v19.16b}, v9.16b
-       add             v8.16b, v8.16b, v10.16b
-       add             v9.16b, v9.16b, v10.16b
-       tbl             v3.16b, {v4.16b-v7.16b}, v8.16b
-       tbx             v23.16b, {v16.16b-v19.16b}, v9.16b
-
-       eor             v20.16b, v20.16b, v0.16b
-       eor             v21.16b, v21.16b, v1.16b
-       eor             v22.16b, v22.16b, v2.16b
-       eor             v23.16b, v23.16b, v3.16b
-       st1             {v20.16b-v23.16b}, [x1]
+.Lt192:        cbz             x5, 1f                          // exactly 128 bytes?
+       ld1             {v28.16b-v31.16b}, [x10]
+       add             x5, x5, x1
+       tbl             v28.16b, {v4.16b-v7.16b}, v28.16b
+       tbl             v29.16b, {v4.16b-v7.16b}, v29.16b
+       tbl             v30.16b, {v4.16b-v7.16b}, v30.16b
+       tbl             v31.16b, {v4.16b-v7.16b}, v31.16b
+
+0:     eor             v20.16b, v20.16b, v28.16b
+       eor             v21.16b, v21.16b, v29.16b
+       eor             v22.16b, v22.16b, v30.16b
+       eor             v23.16b, v23.16b, v31.16b
+       st1             {v20.16b-v23.16b}, [x5]         // overlapping stores
+1:     st1             {v16.16b-v19.16b}, [x1]
        b               .Lout
 
+       // fewer than 128 bytes of in/output
+.Lt128:        ld1             {v28.16b-v31.16b}, [x10]
+       add             x5, x5, x1
+       sub             x1, x1, #64
+       tbl             v28.16b, {v0.16b-v3.16b}, v28.16b
+       tbl             v29.16b, {v0.16b-v3.16b}, v29.16b
+       tbl             v30.16b, {v0.16b-v3.16b}, v30.16b
+       tbl             v31.16b, {v0.16b-v3.16b}, v31.16b
+       ld1             {v16.16b-v19.16b}, [x1]         // reload first output block
+       b               0b
+
        // fewer than 256 bytes of in/output
-2:     ld1             {v4.16b}, [x10]
-       ld1             {v5.16b}, [x11]
-       movi            v6.16b, #16
-       add             x1, x1, x7
+.Lt256:        cbz             x6, 2f                          // exactly 192 bytes?
+       ld1             {v4.16b-v7.16b}, [x10]
+       add             x6, x6, x1
        tbl             v0.16b, {v8.16b-v11.16b}, v4.16b
-       tbx             v24.16b, {v20.16b-v23.16b}, v5.16b
-       add             v4.16b, v4.16b, v6.16b
-       add             v5.16b, v5.16b, v6.16b
-       tbl             v1.16b, {v8.16b-v11.16b}, v4.16b
-       tbx             v25.16b, {v20.16b-v23.16b}, v5.16b
-       add             v4.16b, v4.16b, v6.16b
-       add             v5.16b, v5.16b, v6.16b
-       tbl             v2.16b, {v8.16b-v11.16b}, v4.16b
-       tbx             v26.16b, {v20.16b-v23.16b}, v5.16b
-       add             v4.16b, v4.16b, v6.16b
-       add             v5.16b, v5.16b, v6.16b
-       tbl             v3.16b, {v8.16b-v11.16b}, v4.16b
-       tbx             v27.16b, {v20.16b-v23.16b}, v5.16b
-
-       eor             v24.16b, v24.16b, v0.16b
-       eor             v25.16b, v25.16b, v1.16b
-       eor             v26.16b, v26.16b, v2.16b
-       eor             v27.16b, v27.16b, v3.16b
-       st1             {v24.16b-v27.16b}, [x1]
+       tbl             v1.16b, {v8.16b-v11.16b}, v5.16b
+       tbl             v2.16b, {v8.16b-v11.16b}, v6.16b
+       tbl             v3.16b, {v8.16b-v11.16b}, v7.16b
+
+       eor             v28.16b, v28.16b, v0.16b
+       eor             v29.16b, v29.16b, v1.16b
+       eor             v30.16b, v30.16b, v2.16b
+       eor             v31.16b, v31.16b, v3.16b
+       st1             {v28.16b-v31.16b}, [x6]         // overlapping stores
+2:     st1             {v20.16b-v23.16b}, [x1]
        b               .Lout
 
        // fewer than 320 bytes of in/output
-3:     ld1             {v4.16b}, [x10]
-       ld1             {v5.16b}, [x11]
-       movi            v6.16b, #16
-       add             x1, x1, x8
+.Lt320:        cbz             x7, 3f                          // exactly 256 bytes?
+       ld1             {v4.16b-v7.16b}, [x10]
+       add             x7, x7, x1
        tbl             v0.16b, {v12.16b-v15.16b}, v4.16b
-       tbx             v28.16b, {v24.16b-v27.16b}, v5.16b
-       add             v4.16b, v4.16b, v6.16b
-       add             v5.16b, v5.16b, v6.16b
-       tbl             v1.16b, {v12.16b-v15.16b}, v4.16b
-       tbx             v29.16b, {v24.16b-v27.16b}, v5.16b
-       add             v4.16b, v4.16b, v6.16b
-       add             v5.16b, v5.16b, v6.16b
-       tbl             v2.16b, {v12.16b-v15.16b}, v4.16b
-       tbx             v30.16b, {v24.16b-v27.16b}, v5.16b
-       add             v4.16b, v4.16b, v6.16b
-       add             v5.16b, v5.16b, v6.16b
-       tbl             v3.16b, {v12.16b-v15.16b}, v4.16b
-       tbx             v31.16b, {v24.16b-v27.16b}, v5.16b
+       tbl             v1.16b, {v12.16b-v15.16b}, v5.16b
+       tbl             v2.16b, {v12.16b-v15.16b}, v6.16b
+       tbl             v3.16b, {v12.16b-v15.16b}, v7.16b
 
        eor             v28.16b, v28.16b, v0.16b
        eor             v29.16b, v29.16b, v1.16b
        eor             v30.16b, v30.16b, v2.16b
        eor             v31.16b, v31.16b, v3.16b
-       st1             {v28.16b-v31.16b}, [x1]
+       st1             {v28.16b-v31.16b}, [x7]         // overlapping stores
+3:     st1             {v24.16b-v27.16b}, [x1]
        b               .Lout
 SYM_FUNC_END(chacha_4block_xor_neon)
 
@@ -851,7 +796,7 @@ SYM_FUNC_END(chacha_4block_xor_neon)
        .align          L1_CACHE_SHIFT
 .Lpermute:
        .set            .Li, 0
-       .rept           192
+       .rept           128
        .byte           (.Li - 64)
        .set            .Li, .Li + 1
        .endr