crypto: arm64/chacha - fix chacha_4block_xor_neon() for big endian
author Eric Biggers <ebiggers@google.com>
Sat, 23 Feb 2019 06:54:07 +0000 (22:54 -0800)
committer Herbert Xu <herbert@gondor.apana.org.au>
Thu, 28 Feb 2019 06:37:48 +0000 (14:37 +0800)
The change to encrypt a fifth ChaCha block using scalar instructions
caused the chacha20-neon, xchacha20-neon, and xchacha12-neon self-tests
to start failing on big endian arm64 kernels.  The bug is that the
keystream block produced in 32-bit scalar registers is directly XOR'd
with the data words, which are loaded and stored in native endianness.
Thus in big endian mode the data bytes end up XOR'd with the wrong
bytes.  Fix it by byte-swapping the keystream words in big endian mode.
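
A hypothetical C sketch (not kernel code; the function name is made up here)
of what the scalar tail effectively does and why the swap is needed: the
keystream word is applied as a single native-endian 32-bit load/XOR/store,
which only matches ChaCha's little-endian, byte-wise keystream definition on
a little-endian CPU.

    #include <stdint.h>
    #include <string.h>

    static void xor_keystream_word(uint8_t *data, uint32_t ks)
    {
            uint32_t d;

            memcpy(&d, data, sizeof(d));    /* native-endian load, like ldp */
    #if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
            ks = __builtin_bswap32(ks);     /* the 'rev' added by this patch */
    #endif
            d ^= ks;                        /* keystream byte now hits the data
                                               byte at the same memory offset */
            memcpy(data, &d, sizeof(d));    /* native-endian store */
    }

On little endian the swap compiles away; on big endian it reorders the
keystream bytes so they line up with the data bytes as stored in memory.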

Fixes: 2fe55987b262 ("crypto: arm64/chacha - use combined SIMD/ALU routine for more speed")
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S
index 021bb9e..bfb80e1 100644
--- a/arch/arm64/crypto/chacha-neon-core.S
+++ b/arch/arm64/crypto/chacha-neon-core.S
@@ -532,6 +532,10 @@ ENTRY(chacha_4block_xor_neon)
        add             v3.4s, v3.4s, v19.4s
          add           a2, a2, w8
          add           a3, a3, w9
+CPU_BE(          rev           a0, a0          )
+CPU_BE(          rev           a1, a1          )
+CPU_BE(          rev           a2, a2          )
+CPU_BE(          rev           a3, a3          )
 
        ld4r            {v24.4s-v27.4s}, [x0], #16
        ld4r            {v28.4s-v31.4s}, [x0]
@@ -552,6 +556,10 @@ ENTRY(chacha_4block_xor_neon)
        add             v7.4s, v7.4s, v23.4s
          add           a6, a6, w8
          add           a7, a7, w9
+CPU_BE(          rev           a4, a4          )
+CPU_BE(          rev           a5, a5          )
+CPU_BE(          rev           a6, a6          )
+CPU_BE(          rev           a7, a7          )
 
        // x8[0-3] += s2[0]
        // x9[0-3] += s2[1]
@@ -569,6 +577,10 @@ ENTRY(chacha_4block_xor_neon)
        add             v11.4s, v11.4s, v27.4s
          add           a10, a10, w8
          add           a11, a11, w9
+CPU_BE(          rev           a8, a8          )
+CPU_BE(          rev           a9, a9          )
+CPU_BE(          rev           a10, a10        )
+CPU_BE(          rev           a11, a11        )
 
        // x12[0-3] += s3[0]
        // x13[0-3] += s3[1]
@@ -586,6 +598,10 @@ ENTRY(chacha_4block_xor_neon)
        add             v15.4s, v15.4s, v31.4s
          add           a14, a14, w8
          add           a15, a15, w9
+CPU_BE(          rev           a12, a12        )
+CPU_BE(          rev           a13, a13        )
+CPU_BE(          rev           a14, a14        )
+CPU_BE(          rev           a15, a15        )
 
        // interleave 32-bit words in state n, n+1
          ldp           w6, w7, [x2], #64
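
For reference, CPU_BE() is the arm64 assembler helper that emits its argument
only when the kernel is built big endian, so the added rev instructions cost
nothing on little-endian builds. A sketch of the idea (the real macro lives in
arch/arm64/include/asm/assembler.h):

    #ifdef CONFIG_CPU_BIG_ENDIAN
    #define CPU_BE(code...) code
    #else
    #define CPU_BE(code...)
    #endif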