Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
index 2883def..324039b 100644
--- a/arch/arm64/crypto/aes-modes.S
+++ b/arch/arm64/crypto/aes-modes.S
        .text
        .align          4
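+/*
+ * MAX_STRIDE selects the interleave factor: 4-way by default, or 5-way
+ * when the file that includes this one defines MAX_STRIDE as 5.  The
+ * ST4()/ST5() macros emit their argument only for the matching stride,
+ * so both variants can share the bulk of the code below.  In a 5-way
+ * build, for example:
+ *
+ *   ST4(   bl   aes_encrypt_block4x   )   expands to nothing
+ *   ST5(   bl   aes_encrypt_block5x   )   expands to the bl
+ *
+ * The cbciv and vctr names used further down are assumed to be register
+ * aliases provided by the including file.
+ */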
 
+#ifndef MAX_STRIDE
+#define MAX_STRIDE     4
+#endif
+
+#if MAX_STRIDE == 4
+#define ST4(x...) x
+#define ST5(x...)
+#else
+#define ST4(x...)
+#define ST5(x...) x
+#endif
+
 aes_encrypt_block4x:
        encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
        ret
@@ -20,6 +32,18 @@ aes_decrypt_block4x:
        ret
 ENDPROC(aes_decrypt_block4x)
 
+#if MAX_STRIDE == 5
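+/*
+ * 5-way interleaved variants of the 4-way helpers above, only assembled
+ * when the 5-way stride is selected.
+ */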
+aes_encrypt_block5x:
+       encrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
+       ret
+ENDPROC(aes_encrypt_block5x)
+
+aes_decrypt_block5x:
+       decrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
+       ret
+ENDPROC(aes_decrypt_block5x)
+#endif
+
        /*
         * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks)
@@ -34,14 +58,17 @@ AES_ENTRY(aes_ecb_encrypt)
        enc_prepare     w3, x2, x5
 
 .LecbencloopNx:
-       subs            w4, w4, #4
+       subs            w4, w4, #MAX_STRIDE
        bmi             .Lecbenc1x
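+       /* handle MAX_STRIDE blocks per iteration; ST5() lines add the 5th */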
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
-       bl              aes_encrypt_block4x
+ST4(   bl              aes_encrypt_block4x             )
+ST5(   ld1             {v4.16b}, [x1], #16             )
+ST5(   bl              aes_encrypt_block5x             )
        st1             {v0.16b-v3.16b}, [x0], #64
+ST5(   st1             {v4.16b}, [x0], #16             )
        b               .LecbencloopNx
 .Lecbenc1x:
-       adds            w4, w4, #4
+       adds            w4, w4, #MAX_STRIDE
        beq             .Lecbencout
 .Lecbencloop:
        ld1             {v0.16b}, [x1], #16             /* get next pt block */
@@ -62,14 +89,17 @@ AES_ENTRY(aes_ecb_decrypt)
        dec_prepare     w3, x2, x5
 
 .LecbdecloopNx:
-       subs            w4, w4, #4
+       subs            w4, w4, #MAX_STRIDE
        bmi             .Lecbdec1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
-       bl              aes_decrypt_block4x
+ST4(   bl              aes_decrypt_block4x             )
+ST5(   ld1             {v4.16b}, [x1], #16             )
+ST5(   bl              aes_decrypt_block5x             )
        st1             {v0.16b-v3.16b}, [x0], #64
+ST5(   st1             {v4.16b}, [x0], #16             )
        b               .LecbdecloopNx
 .Lecbdec1x:
-       adds            w4, w4, #4
+       adds            w4, w4, #MAX_STRIDE
        beq             .Lecbdecout
 .Lecbdecloop:
        ld1             {v0.16b}, [x1], #16             /* get next ct block */
@@ -129,39 +159,56 @@ AES_ENTRY(aes_cbc_decrypt)
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp
 
-       ld1             {v7.16b}, [x5]                  /* get iv */
+       ld1             {cbciv.16b}, [x5]               /* get iv */
        dec_prepare     w3, x2, x6
 
 .LcbcdecloopNx:
-       subs            w4, w4, #4
+       subs            w4, w4, #MAX_STRIDE
        bmi             .Lcbcdec1x
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
+#if MAX_STRIDE == 5
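+       /*
+        * Keep copies of ct blocks 0-2 in v5-v7 and the previous IV in
+        * cbciv for the xors below; ct blocks 3 and 4 do not fit in spare
+        * registers, so rewind x1 and reload them after the in-place
+        * decrypt.
+        */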
+       ld1             {v4.16b}, [x1], #16             /* get 1 ct block */
+       mov             v5.16b, v0.16b
+       mov             v6.16b, v1.16b
+       mov             v7.16b, v2.16b
+       bl              aes_decrypt_block5x
+       sub             x1, x1, #32
+       eor             v0.16b, v0.16b, cbciv.16b
+       eor             v1.16b, v1.16b, v5.16b
+       ld1             {v5.16b}, [x1], #16             /* reload 1 ct block */
+       ld1             {cbciv.16b}, [x1], #16          /* reload 1 ct block */
+       eor             v2.16b, v2.16b, v6.16b
+       eor             v3.16b, v3.16b, v7.16b
+       eor             v4.16b, v4.16b, v5.16b
+#else
        mov             v4.16b, v0.16b
        mov             v5.16b, v1.16b
        mov             v6.16b, v2.16b
        bl              aes_decrypt_block4x
        sub             x1, x1, #16
-       eor             v0.16b, v0.16b, v7.16b
+       eor             v0.16b, v0.16b, cbciv.16b
        eor             v1.16b, v1.16b, v4.16b
-       ld1             {v7.16b}, [x1], #16             /* reload 1 ct block */
+       ld1             {cbciv.16b}, [x1], #16          /* reload 1 ct block */
        eor             v2.16b, v2.16b, v5.16b
        eor             v3.16b, v3.16b, v6.16b
+#endif
        st1             {v0.16b-v3.16b}, [x0], #64
+ST5(   st1             {v4.16b}, [x0], #16             )
        b               .LcbcdecloopNx
 .Lcbcdec1x:
-       adds            w4, w4, #4
+       adds            w4, w4, #MAX_STRIDE
        beq             .Lcbcdecout
 .Lcbcdecloop:
        ld1             {v1.16b}, [x1], #16             /* get next ct block */
        mov             v0.16b, v1.16b                  /* ...and copy to v0 */
        decrypt_block   v0, w3, x2, x6, w7
-       eor             v0.16b, v0.16b, v7.16b          /* xor with iv => pt */
-       mov             v7.16b, v1.16b                  /* ct is next iv */
+       eor             v0.16b, v0.16b, cbciv.16b       /* xor with iv => pt */
+       mov             cbciv.16b, v1.16b               /* ct is next iv */
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lcbcdecloop
 .Lcbcdecout:
-       st1             {v7.16b}, [x5]                  /* return iv */
+       st1             {cbciv.16b}, [x5]               /* return iv */
        ldp             x29, x30, [sp], #16
        ret
 AES_ENDPROC(aes_cbc_decrypt)
@@ -255,51 +302,60 @@ AES_ENTRY(aes_ctr_encrypt)
        mov             x29, sp
 
        enc_prepare     w3, x2, x6
-       ld1             {v4.16b}, [x5]
+       ld1             {vctr.16b}, [x5]
 
-       umov            x6, v4.d[1]             /* keep swabbed ctr in reg */
+       umov            x6, vctr.d[1]           /* keep swabbed ctr in reg */
        rev             x6, x6
        cmn             w6, w4                  /* 32 bit overflow? */
        bcs             .Lctrloop
 .LctrloopNx:
-       subs            w4, w4, #4
+       subs            w4, w4, #MAX_STRIDE
        bmi             .Lctr1x
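+       /*
+        * Build MAX_STRIDE counter blocks: v0 reuses the current vctr
+        * value, while w7-w10 take the next low counter words, which are
+        * byte-swapped and inserted into lane 3 of v1-v4.
+        */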
        add             w7, w6, #1
-       mov             v0.16b, v4.16b
+       mov             v0.16b, vctr.16b
        add             w8, w6, #2
-       mov             v1.16b, v4.16b
+       mov             v1.16b, vctr.16b
        add             w9, w6, #3
-       mov             v2.16b, v4.16b
+       mov             v2.16b, vctr.16b
        rev             w7, w7
-       mov             v3.16b, v4.16b
+       mov             v3.16b, vctr.16b
        rev             w8, w8
+ST5(   mov             v4.16b, vctr.16b                )
        mov             v1.s[3], w7
        rev             w9, w9
+ST5(   add             w10, w6, #4                     )
        mov             v2.s[3], w8
+ST5(   rev             w10, w10                        )
        mov             v3.s[3], w9
+ST5(   mov             v4.s[3], w10                    )
        ld1             {v5.16b-v7.16b}, [x1], #48      /* get 3 input blocks */
-       bl              aes_encrypt_block4x
+ST4(   bl              aes_encrypt_block4x             )
+ST5(   bl              aes_encrypt_block5x             )
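+       /*
+        * Only three plaintext blocks fit in v5-v7 ahead of the call, so
+        * load the remaining one (4-way) or two (5-way) here and xor the
+        * keystream in.
+        */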
        eor             v0.16b, v5.16b, v0.16b
-       ld1             {v5.16b}, [x1], #16             /* get 1 input block  */
+ST4(   ld1             {v5.16b}, [x1], #16             )
        eor             v1.16b, v6.16b, v1.16b
+ST5(   ld1             {v5.16b-v6.16b}, [x1], #32      )
        eor             v2.16b, v7.16b, v2.16b
        eor             v3.16b, v5.16b, v3.16b
+ST5(   eor             v4.16b, v6.16b, v4.16b          )
        st1             {v0.16b-v3.16b}, [x0], #64
-       add             x6, x6, #4
+ST5(   st1             {v4.16b}, [x0], #16             )
+       add             x6, x6, #MAX_STRIDE
        rev             x7, x6
-       ins             v4.d[1], x7
+       ins             vctr.d[1], x7
        cbz             w4, .Lctrout
        b               .LctrloopNx
 .Lctr1x:
-       adds            w4, w4, #4
+       adds            w4, w4, #MAX_STRIDE
        beq             .Lctrout
 .Lctrloop:
-       mov             v0.16b, v4.16b
+       mov             v0.16b, vctr.16b
        encrypt_block   v0, w3, x2, x8, w7
 
        adds            x6, x6, #1              /* increment BE ctr */
        rev             x7, x6
-       ins             v4.d[1], x7
+       ins             vctr.d[1], x7
        bcs             .Lctrcarry              /* overflow? */
 
 .Lctrcarrydone:
@@ -311,7 +367,7 @@ AES_ENTRY(aes_ctr_encrypt)
        bne             .Lctrloop
 
 .Lctrout:
-       st1             {v4.16b}, [x5]          /* return next CTR value */
+       st1             {vctr.16b}, [x5]        /* return next CTR value */
        ldp             x29, x30, [sp], #16
        ret
 
@@ -320,11 +376,11 @@ AES_ENTRY(aes_ctr_encrypt)
        b               .Lctrout
 
 .Lctrcarry:
-       umov            x7, v4.d[0]             /* load upper word of ctr  */
+       umov            x7, vctr.d[0]           /* load upper word of ctr  */
        rev             x7, x7                  /* ... to handle the carry */
        add             x7, x7, #1
        rev             x7, x7
-       ins             v4.d[0], x7
+       ins             vctr.d[0], x7
        b               .Lctrcarrydone
 AES_ENDPROC(aes_ctr_encrypt)