crypto: arm64/crct10dif - implement non-Crypto Extensions alternative
author	Ard Biesheuvel <ard.biesheuvel@linaro.org>
Mon, 27 Aug 2018 15:38:12 +0000 (17:38 +0200)
committer	Herbert Xu <herbert@gondor.apana.org.au>
Tue, 4 Sep 2018 03:37:04 +0000 (11:37 +0800)
The arm64 implementation of the CRC-T10DIF algorithm uses the 64x64 bit
polynomial multiplication (PMULL) instructions, which are optional in the
architecture. If these instructions are not available, we fall back to the
generic C routine, which is slow and inefficient.

So let's reuse the 64x64 bit PMULL alternative from the GHASH driver, which
implements the multiplication as a sequence of ~40 instructions involving
8x8 bit PMULL plus some shifting and masking. This is a lot slower than the
native instruction, but it is still twice as fast as the current
[unoptimized] C code on Cortex-A53, and it is time-invariant and much easier
on the D-cache.
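
To illustrate the decomposition (a sketch with hypothetical helper names,
not code from this patch): a 64x64 bit carryless multiply can be assembled
from 8x8 bit carryless multiplies by XOR-ing each byte-level partial product
into a 128-bit accumulator at the matching bit offset, e.g. in portable C:

  #include <stdint.h>

  /* 8x8 -> 16 bit carryless multiply: XOR of shifted copies of a,
   * one per set bit of b (what a single PMULL lane computes). */
  static uint16_t clmul8(uint8_t a, uint8_t b)
  {
  	uint16_t r = 0;

  	for (int i = 0; i < 8; i++)
  		if (b & (1u << i))
  			r ^= (uint16_t)a << i;
  	return r;
  }

  /* 64x64 -> 128 bit carryless multiply built from 8x8 pieces. */
  static void clmul64(uint64_t a, uint64_t b, uint64_t res[2])
  {
  	res[0] = res[1] = 0;

  	for (int i = 0; i < 8; i++) {
  		for (int j = 0; j < 8; j++) {
  			uint16_t p = clmul8(a >> (8 * i), b >> (8 * j));
  			int sh = 8 * (i + j);

  			res[sh / 64] ^= (uint64_t)p << (sh % 64);
  			if ((sh % 64) > 48)	/* product straddles the lane boundary */
  				res[1] ^= (uint64_t)p >> (64 - sh % 64);
  		}
  	}
  }

The NEON sequence below performs eight such byte-level multiplies per PMULL
instruction and replaces the explicit loops with TBL-based byte rotations of
the operands, which is where the ~40 instruction count comes from.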

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
arch/arm64/crypto/crct10dif-ce-core.S
arch/arm64/crypto/crct10dif-ce-glue.c

diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S
index a399510..9e82e8e 100644
 
        vzr             .req    v13
 
+       ad              .req    v14
+       bd              .req    v10
+
+       k00_16          .req    v15
+       k32_48          .req    v16
+
+       t3              .req    v17
+       t4              .req    v18
+       t5              .req    v19
+       t6              .req    v20
+       t7              .req    v21
+       t8              .req    v22
+       t9              .req    v23
+
+       perm1           .req    v24
+       perm2           .req    v25
+       perm3           .req    v26
+       perm4           .req    v27
+
+       bd1             .req    v28
+       bd2             .req    v29
+       bd3             .req    v30
+       bd4             .req    v31
+
+       .macro          __pmull_init_p64
+       .endm
+
+       .macro          __pmull_pre_p64, bd
+       .endm
+
+       .macro          __pmull_init_p8
+       // k00_16 := 0x0000000000000000_000000000000ffff
+       // k32_48 := 0x00000000ffffffff_0000ffffffffffff
+       movi            k32_48.2d, #0xffffffff
+       mov             k32_48.h[2], k32_48.h[0]
+       ushr            k00_16.2d, k32_48.2d, #32
+
+       // prepare the permutation vectors
+       mov_q           x5, 0x080f0e0d0c0b0a09
+       movi            perm4.8b, #8
+       dup             perm1.2d, x5
+       eor             perm1.16b, perm1.16b, perm4.16b
+       ushr            perm2.2d, perm1.2d, #8
+       ushr            perm3.2d, perm1.2d, #16
+       ushr            perm4.2d, perm1.2d, #24
+       sli             perm2.2d, perm1.2d, #56
+       sli             perm3.2d, perm1.2d, #48
+       sli             perm4.2d, perm1.2d, #40
+       .endm
+
+       .macro          __pmull_pre_p8, bd
+       tbl             bd1.16b, {\bd\().16b}, perm1.16b
+       tbl             bd2.16b, {\bd\().16b}, perm2.16b
+       tbl             bd3.16b, {\bd\().16b}, perm3.16b
+       tbl             bd4.16b, {\bd\().16b}, perm4.16b
+       .endm
+
+__pmull_p8_core:
+.L__pmull_p8_core:
+       ext             t4.8b, ad.8b, ad.8b, #1                 // A1
+       ext             t5.8b, ad.8b, ad.8b, #2                 // A2
+       ext             t6.8b, ad.8b, ad.8b, #3                 // A3
+
+       pmull           t4.8h, t4.8b, bd.8b                     // F = A1*B
+       pmull           t8.8h, ad.8b, bd1.8b                    // E = A*B1
+       pmull           t5.8h, t5.8b, bd.8b                     // H = A2*B
+       pmull           t7.8h, ad.8b, bd2.8b                    // G = A*B2
+       pmull           t6.8h, t6.8b, bd.8b                     // J = A3*B
+       pmull           t9.8h, ad.8b, bd3.8b                    // I = A*B3
+       pmull           t3.8h, ad.8b, bd4.8b                    // K = A*B4
+       b               0f
+
+.L__pmull_p8_core2:
+       tbl             t4.16b, {ad.16b}, perm1.16b             // A1
+       tbl             t5.16b, {ad.16b}, perm2.16b             // A2
+       tbl             t6.16b, {ad.16b}, perm3.16b             // A3
+
+       pmull2          t4.8h, t4.16b, bd.16b                   // F = A1*B
+       pmull2          t8.8h, ad.16b, bd1.16b                  // E = A*B1
+       pmull2          t5.8h, t5.16b, bd.16b                   // H = A2*B
+       pmull2          t7.8h, ad.16b, bd2.16b                  // G = A*B2
+       pmull2          t6.8h, t6.16b, bd.16b                   // J = A3*B
+       pmull2          t9.8h, ad.16b, bd3.16b                  // I = A*B3
+       pmull2          t3.8h, ad.16b, bd4.16b                  // K = A*B4
+
+0:     eor             t4.16b, t4.16b, t8.16b                  // L = E + F
+       eor             t5.16b, t5.16b, t7.16b                  // M = G + H
+       eor             t6.16b, t6.16b, t9.16b                  // N = I + J
+
+       uzp1            t8.2d, t4.2d, t5.2d
+       uzp2            t4.2d, t4.2d, t5.2d
+       uzp1            t7.2d, t6.2d, t3.2d
+       uzp2            t6.2d, t6.2d, t3.2d
+
+       // t4 = (L) (P0 + P1) << 8
+       // t5 = (M) (P2 + P3) << 16
+       eor             t8.16b, t8.16b, t4.16b
+       and             t4.16b, t4.16b, k32_48.16b
+
+       // t6 = (N) (P4 + P5) << 24
+       // t7 = (K) (P6 + P7) << 32
+       eor             t7.16b, t7.16b, t6.16b
+       and             t6.16b, t6.16b, k00_16.16b
+
+       eor             t8.16b, t8.16b, t4.16b
+       eor             t7.16b, t7.16b, t6.16b
+
+       zip2            t5.2d, t8.2d, t4.2d
+       zip1            t4.2d, t8.2d, t4.2d
+       zip2            t3.2d, t7.2d, t6.2d
+       zip1            t6.2d, t7.2d, t6.2d
+
+       ext             t4.16b, t4.16b, t4.16b, #15
+       ext             t5.16b, t5.16b, t5.16b, #14
+       ext             t6.16b, t6.16b, t6.16b, #13
+       ext             t3.16b, t3.16b, t3.16b, #12
+
+       eor             t4.16b, t4.16b, t5.16b
+       eor             t6.16b, t6.16b, t3.16b
+       ret
+ENDPROC(__pmull_p8_core)
+
+       .macro          __pmull_p8, rq, ad, bd, i
+       .ifnc           \bd, v10
+       .err
+       .endif
+       mov             ad.16b, \ad\().16b
+       .ifb            \i
+       pmull           \rq\().8h, \ad\().8b, bd.8b             // D = A*B
+       .else
+       pmull2          \rq\().8h, \ad\().16b, bd.16b           // D = A*B
+       .endif
+
+       bl              .L__pmull_p8_core\i
+
+       eor             \rq\().16b, \rq\().16b, t4.16b
+       eor             \rq\().16b, \rq\().16b, t6.16b
+       .endm
+
        .macro          fold64, p, reg1, reg2
        ldp             q11, q12, [arg2], #0x20
 
@@ -106,6 +245,7 @@ CPU_LE(     ext             v12.16b, v12.16b, v12.16b, #8   )
        __pmull_\p      \reg, \reg, v10, 2
        .ifnb           \rk
        ldr_l           q10, \rk, x8
+       __pmull_pre_\p  v10
        .endif
        eor             v7.16b, v7.16b, v8.16b
        eor             v7.16b, v7.16b, \reg\().16b
@@ -128,6 +268,8 @@ CPU_LE(     ext             v12.16b, v12.16b, v12.16b, #8   )
 
        movi            vzr.16b, #0             // init zero register
 
+       __pmull_init_\p
+
        // adjust the 16-bit initial_crc value, scale it to 32 bits
        lsl             arg1_low32, arg1_low32, #16
 
@@ -176,6 +318,7 @@ CPU_LE(     ext             v7.16b, v7.16b, v7.16b, #8      )
        ldr_l           q10, rk3, x8    // xmm10 has rk3 and rk4
                                        // type of pmull instruction
                                        // will determine which constant to use
+       __pmull_pre_\p  v10
 
        //
        // we subtract 256 instead of 128 to save one instruction from the loop
@@ -212,6 +355,8 @@ CPU_LE(     ext             v7.16b, v7.16b, v7.16b, #8      )
        ldp             q6, q7, [sp, #.Lframe_local_offset + 96]
        ldr_l           q10, rk3, x8
        movi            vzr.16b, #0             // init zero register
+       __pmull_init_\p
+       __pmull_pre_\p  v10
        endif_yield_neon
 
        b               .L_fold_64_B_loop_\@
@@ -225,6 +370,7 @@ CPU_LE(     ext             v7.16b, v7.16b, v7.16b, #8      )
        // constants
 
        ldr_l           q10, rk9, x8
+       __pmull_pre_\p  v10
 
        fold16          \p, v0, rk11
        fold16          \p, v1, rk13
@@ -306,6 +452,7 @@ CPU_LE(     ext             v1.16b, v1.16b, v1.16b, #8      )
 .L_128_done_\@:
        // compute crc of a 128-bit value
        ldr_l           q10, rk5, x8            // rk5 and rk6 in xmm10
+       __pmull_pre_\p  v10
 
        // 64b fold
        ext             v0.16b, vzr.16b, v7.16b, #8
@@ -321,6 +468,7 @@ CPU_LE(     ext             v1.16b, v1.16b, v1.16b, #8      )
 
        // barrett reduction
        ldr_l           q10, rk7, x8
+       __pmull_pre_\p  v10
        mov             v0.d[0], v7.d[1]
 
        __pmull_\p      v0, v0, v10
@@ -352,6 +500,7 @@ CPU_LE(     ext             v7.16b, v7.16b, v7.16b, #8      )
        b.lt            .L_less_than_16_left_\@
 
        ldr_l           q10, rk1, x8            // rk1 and rk2 in xmm10
+       __pmull_pre_\p  v10
 
        // update the counter. subtract 32 instead of 16 to save one
        // instruction from the loop
@@ -372,6 +521,11 @@ CPU_LE(    ext             v7.16b, v7.16b, v7.16b, #8      )
        b               .L_128_done_\@
        .endm
 
+ENTRY(crc_t10dif_pmull_p8)
+       crc_t10dif_pmull        p8
+ENDPROC(crc_t10dif_pmull_p8)
+
+       .align          5
 ENTRY(crc_t10dif_pmull_p64)
        crc_t10dif_pmull        p64
 ENDPROC(crc_t10dif_pmull_p64)
diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c
index 343a1e9..b461d62 100644
@@ -23,6 +23,7 @@
 #define CRC_T10DIF_PMULL_CHUNK_SIZE    16U
 
 asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 buf[], u64 len);
+asmlinkage u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 buf[], u64 len);
 
 static u16 (*crc_t10dif_pmull)(u16 init_crc, const u8 buf[], u64 len);
 
@@ -87,7 +88,10 @@ static struct shash_alg crc_t10dif_alg = {
 
 static int __init crc_t10dif_mod_init(void)
 {
-       crc_t10dif_pmull = crc_t10dif_pmull_p64;
+       if (elf_hwcap & HWCAP_PMULL)
+               crc_t10dif_pmull = crc_t10dif_pmull_p64;
+       else
+               crc_t10dif_pmull = crc_t10dif_pmull_p8;
 
        return crypto_register_shash(&crc_t10dif_alg);
 }
@@ -97,8 +101,10 @@ static void __exit crc_t10dif_mod_exit(void)
        crypto_unregister_shash(&crc_t10dif_alg);
 }
 
-module_cpu_feature_match(PMULL, crc_t10dif_mod_init);
+module_cpu_feature_match(ASIMD, crc_t10dif_mod_init);
 module_exit(crc_t10dif_mod_exit);
 
 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
 MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("crct10dif");
+MODULE_ALIAS_CRYPTO("crct10dif-arm64-ce");