crypto: powerpc/aes - ECB/CBC/CTR/XTS modes
author Markus Stockhausen <stockhausen@collogia.de>
Sun, 22 Feb 2015 09:00:00 +0000 (10:00 +0100)
committer Herbert Xu <herbert@gondor.apana.org.au>
Sun, 1 Mar 2015 10:02:28 +0000 (23:02 +1300)
The assembler block cipher module that controls the core
AES functions.

Signed-off-by: Markus Stockhausen <stockhausen@collogia.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
arch/powerpc/crypto/aes-spe-modes.S [new file with mode: 0644]

diff --git a/arch/powerpc/crypto/aes-spe-modes.S b/arch/powerpc/crypto/aes-spe-modes.S
new file mode 100644 (file)
index 0000000..ad48032
--- /dev/null
@@ -0,0 +1,630 @@
+/*
+ * AES modes (ECB/CBC/CTR/XTS) for PPC AES implementation
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <asm/ppc_asm.h>
+#include "aes-spe-regs.h"
+
+#ifdef __BIG_ENDIAN__                  /* Macros for big endian builds */
+/* Here 'off' selects the word; pointers only move in NEXT_BLOCK.      */
+#define LOAD_DATA(reg, off) \
+       lwz             reg,off(rSP);   /* load word at rSP + off       */
+#define SAVE_DATA(reg, off) \
+       stw             reg,off(rDP);   /* store word at rDP + off      */
+#define NEXT_BLOCK \
+       addi            rSP,rSP,16;     /* step rSP and rDP forward by  */ \
+       addi            rDP,rDP,16;     /* one 16 byte block            */
+#define LOAD_IV(reg, off) \
+       lwz             reg,off(rIP);   /* load IV word at rIP + off    */
+#define SAVE_IV(reg, off) \
+       stw             reg,off(rIP);   /* store IV word at rIP + off   */
+#define START_IV                       /* rIP never moved, no reset    */
+#define CBC_DEC 16                     /* CBC: step back one block     */
+#define CTR_DEC 1                      /* rIP-1 for pre-increment lbzu */
+
+#else                                  /* Macros for little endian     */
+/* Here 'off' is ignored: each access advances its pointer by 4 bytes. */
+#define LOAD_DATA(reg, off) \
+       lwbrx           reg,0,rSP;      /* load byte-reversed word      */ \
+       addi            rSP,rSP,4;      /* and increment pointer        */
+#define SAVE_DATA(reg, off) \
+       stwbrx          reg,0,rDP;      /* store byte-reversed word     */ \
+       addi            rDP,rDP,4;      /* and increment pointer        */
+#define NEXT_BLOCK                     /* nothing to do, see above     */
+#define LOAD_IV(reg, off) \
+       lwbrx           reg,0,rIP;      /* load byte-reversed IV word   */ \
+       addi            rIP,rIP,4;      /* and increment pointer        */
+#define SAVE_IV(reg, off) \
+       stwbrx          reg,0,rIP;      /* store byte-reversed IV word  */ \
+       addi            rIP,rIP,4;      /* and increment pointer        */
+#define START_IV \
+       subi            rIP,rIP,16;     /* undo 4 increments of rIP     */
+#define CBC_DEC 32                     /* 16 back + 16 auto-increment  */
+#define CTR_DEC 17                     /* 1 + 16 from SAVE_IV incs     */
+
+#endif
+
+#define SAVE_0_REGS                    /* no rI/rG registers to save   */
+#define LOAD_0_REGS                    /* no rI/rG registers to load   */
+
+#define SAVE_4_REGS \
+       stw             rI0,96(r1);     /* save rI0-rI3 at 96..111(r1)  */ \
+       stw             rI1,100(r1);                                       \
+       stw             rI2,104(r1);                                       \
+       stw             rI3,108(r1);
+
+#define LOAD_4_REGS \
+       lwz             rI0,96(r1);     /* restore rI0-rI3 from frame   */ \
+       lwz             rI1,100(r1);                                       \
+       lwz             rI2,104(r1);                                       \
+       lwz             rI3,108(r1);
+
+#define SAVE_8_REGS \
+       SAVE_4_REGS                                                        \
+       stw             rG0,112(r1);    /* save rG0-rG3 at 112..127(r1) */ \
+       stw             rG1,116(r1);                                       \
+       stw             rG2,120(r1);                                       \
+       stw             rG3,124(r1);
+
+#define LOAD_8_REGS \
+       LOAD_4_REGS                                                        \
+       lwz             rG0,112(r1);    /* restore rG0-rG3 from frame   */ \
+       lwz             rG1,116(r1);                                       \
+       lwz             rG2,120(r1);                                       \
+       lwz             rG3,124(r1);
+
+#define INITIALIZE_CRYPT(tab,nr32bitregs) /* prologue; nr32bitregs is 0, 4 or 8 */ \
+       mflr            r0;             /* r0 = return address          */ \
+       stwu            r1,-160(r1);    /* create stack frame           */ \
+       lis             rT0,tab@h;      /* en-/decryption table pointer */ \
+       stw             r0,8(r1);       /* save link register           */ \
+       ori             rT0,rT0,tab@l;                                     \
+       evstdw          r14,16(r1);                                        \
+       mr              rKS,rKP;        /* keep copy of key pointer     */ \
+       evstdw          r15,24(r1);     /* We must save non volatile    */ \
+       evstdw          r16,32(r1);     /* registers. Take the chance   */ \
+       evstdw          r17,40(r1);     /* and save the SPE part too    */ \
+       evstdw          r18,48(r1);                                        \
+       evstdw          r19,56(r1);                                        \
+       evstdw          r20,64(r1);                                        \
+       evstdw          r21,72(r1);                                        \
+       evstdw          r22,80(r1);                                        \
+       evstdw          r23,88(r1);                                        \
+       SAVE_##nr32bitregs##_REGS
+
+#define FINALIZE_CRYPT(nr32bitregs) /* epilogue; same count as INITIALIZE_CRYPT */ \
+       lwz             r0,8(r1);       /* fetch saved link register    */ \
+       evldw           r14,16(r1);     /* restore SPE registers        */ \
+       evldw           r15,24(r1);                                        \
+       evldw           r16,32(r1);                                        \
+       evldw           r17,40(r1);                                        \
+       evldw           r18,48(r1);                                        \
+       evldw           r19,56(r1);                                        \
+       evldw           r20,64(r1);                                        \
+       evldw           r21,72(r1);                                        \
+       evldw           r22,80(r1);                                        \
+       evldw           r23,88(r1);                                        \
+       LOAD_##nr32bitregs##_REGS                                          \
+       mtlr            r0;             /* restore link register        */ \
+       xor             r0,r0,r0;       /* r0 = 0                       */ \
+       stw             r0,16(r1);      /* delete sensitive data        */ \
+       stw             r0,24(r1);      /* that we might have pushed    */ \
+       stw             r0,32(r1);      /* from other context that runs */ \
+       stw             r0,40(r1);      /* the same code                */ \
+       stw             r0,48(r1);      /* NOTE(review): one word per   */ \
+       stw             r0,56(r1);      /* 8 byte slot is cleared and   */ \
+       stw             r0,64(r1);      /* 96..127(r1) is left as is    */ \
+       stw             r0,72(r1);      /* - confirm this is intended   */ \
+       stw             r0,80(r1);                                         \
+       stw             r0,88(r1);                                         \
+       addi            r1,r1,160;      /* cleanup stack frame          */
+
+#define ENDIAN_SWAP(t0, t1, s0, s1) /* t0 = byteswap(s0), t1 = byteswap(s1) */ \
+       rotrwi          t0,s0,8;        /* swap endianness for 2 GPRs   */ \
+       rotrwi          t1,s1,8;        /* s = ABCD gives t = DABC      */ \
+       rlwimi          t0,s0,8,8,15;   /* byte 1 from s rotl 8: DCBC   */ \
+       rlwimi          t1,s1,8,8,15;                                      \
+       rlwimi          t0,s0,8,24,31;  /* byte 3 from s rotl 8: DCBA   */ \
+       rlwimi          t1,s1,8,24,31;
+
+#define GF128_MUL(d0, d1, d2, d3, t0) /* d3:d2:d1:d0 (d3 high) times x; t0 scratch */ \
+       li              t0,0x87;        /* multiplication in GF128      */ \
+       cmpwi           d3,-1;          /* top bit of d3 clear ?        */ \
+       iselgt          t0,0,t0;        /* then t0 = 0, else keep 0x87  */ \
+       rlwimi          d3,d2,0,0,0;    /* propagate "carry" bits       */ \
+       rotlwi          d3,d3,1;        /* d3 = d3 << 1 | msb(d2)       */ \
+       rlwimi          d2,d1,0,0,0;                                       \
+       rotlwi          d2,d2,1;        /* d2 = d2 << 1 | msb(d1)       */ \
+       rlwimi          d1,d0,0,0,0;                                       \
+       slwi            d0,d0,1;        /* shift left 128 bit           */ \
+       rotlwi          d1,d1,1;        /* d1 = d1 << 1 | msb(d0)       */ \
+       xor             d0,d0,t0;       /* fold reduction into d0       */
+
+#define START_KEY(d0, d1, d2, d3) /* rD0..rD3 = d0..d3 ^ four key words at rKP */ \
+       lwz             rW0,0(rKP);     /* rW0-rW3 = 4 words at rKP     */ \
+       mtctr           rRR;            /* CTR = rRR, presumably rounds */ \
+       lwz             rW1,4(rKP);                                        \
+       lwz             rW2,8(rKP);                                        \
+       lwz             rW3,12(rKP);                                       \
+       xor             rD0,d0,rW0;     /* result always lands in rD0-3 */ \
+       xor             rD1,d1,rW1;                                        \
+       xor             rD2,d2,rW2;                                        \
+       xor             rD3,d3,rW3;
+
+/*
+ * ppc_encrypt_aes(u8 *out, const u8 *in, u32 *key_enc,
+ *                u32 rounds)
+ *
+ * Called from the glue layer to encrypt a single 16 byte block.
+ * Round values are AES128 = 4, AES192 = 5, AES256 = 6.
+ * The round routine ppc_encrypt_block is not defined in this file.
+ */
+_GLOBAL(ppc_encrypt_aes)
+       INITIALIZE_CRYPT(PPC_AES_4K_ENCTAB, 0)
+       LOAD_DATA(rD0, 0)               /* read the four input words    */
+       LOAD_DATA(rD1, 4)
+       LOAD_DATA(rD2, 8)
+       LOAD_DATA(rD3, 12)
+       START_KEY(rD0, rD1, rD2, rD3)   /* xor in first four key words  */
+       bl              ppc_encrypt_block
+       xor             rD0,rD0,rW0     /* rW0-rW3 as left by the call, */
+       SAVE_DATA(rD0, 0)               /* presumably last round key    */
+       xor             rD1,rD1,rW1
+       SAVE_DATA(rD1, 4)
+       xor             rD2,rD2,rW2
+       SAVE_DATA(rD2, 8)
+       xor             rD3,rD3,rW3
+       SAVE_DATA(rD3, 12)
+       FINALIZE_CRYPT(0)
+       blr
+
+/*
+ * ppc_decrypt_aes(u8 *out, const u8 *in, u32 *key_dec,
+ *                u32 rounds)
+ *
+ * Called from the glue layer to decrypt a single 16 byte block.
+ * Round values are AES128 = 4, AES192 = 5, AES256 = 6.
+ * The round routine ppc_decrypt_block is not defined in this file.
+ */
+_GLOBAL(ppc_decrypt_aes)
+       INITIALIZE_CRYPT(PPC_AES_4K_DECTAB,0)
+       LOAD_DATA(rD0, 0)               /* read the four input words    */
+       addi            rT1,rT0,4096    /* rT1 = decrypt table + 4096   */
+       LOAD_DATA(rD1, 4)
+       LOAD_DATA(rD2, 8)
+       LOAD_DATA(rD3, 12)
+       START_KEY(rD0, rD1, rD2, rD3)   /* xor in first four key words  */
+       bl              ppc_decrypt_block
+       xor             rD0,rD0,rW0     /* rW0-rW3 as left by the call, */
+       SAVE_DATA(rD0, 0)               /* presumably last round key    */
+       xor             rD1,rD1,rW1
+       SAVE_DATA(rD1, 4)
+       xor             rD2,rD2,rW2
+       SAVE_DATA(rD2, 8)
+       xor             rD3,rD3,rW3
+       SAVE_DATA(rD3, 12)
+       FINALIZE_CRYPT(0)
+       blr
+
+/*
+ * ppc_encrypt_ecb(u8 *out, const u8 *in, u32 *key_enc,
+ *                u32 rounds, u32 bytes);
+ *
+ * Called from the glue layer to encrypt multiple blocks via ECB.
+ * bytes must be at least 16 and only whole blocks are processed:
+ * the loop body runs once before the remaining length is tested.
+ * Round values are AES128 = 4, AES192 = 5 and AES256 = 6.
+ *
+ */
+_GLOBAL(ppc_encrypt_ecb)
+       INITIALIZE_CRYPT(PPC_AES_4K_ENCTAB, 0)
+ppc_encrypt_ecb_loop:
+       LOAD_DATA(rD0, 0)
+       mr              rKP,rKS         /* key pointer saved at entry   */
+       LOAD_DATA(rD1, 4)
+       subi            rLN,rLN,16      /* one block consumed           */
+       LOAD_DATA(rD2, 8)
+       cmpwi           rLN,15          /* cr0 is tested after the call */
+       LOAD_DATA(rD3, 12)
+       START_KEY(rD0, rD1, rD2, rD3)
+       bl              ppc_encrypt_block
+       xor             rD0,rD0,rW0     /* final xor with rW0-rW3       */
+       SAVE_DATA(rD0, 0)
+       xor             rD1,rD1,rW1
+       SAVE_DATA(rD1, 4)
+       xor             rD2,rD2,rW2
+       SAVE_DATA(rD2, 8)
+       xor             rD3,rD3,rW3
+       SAVE_DATA(rD3, 12)
+       NEXT_BLOCK
+       bt              gt,ppc_encrypt_ecb_loop /* more than 15 bytes left */
+       FINALIZE_CRYPT(0)
+       blr
+
+/*
+ * ppc_decrypt_ecb(u8 *out, const u8 *in, u32 *key_dec,
+ *                u32 rounds, u32 bytes);
+ *
+ * Called from the glue layer to decrypt multiple blocks via ECB.
+ * bytes must be at least 16 and only whole blocks are processed:
+ * the loop body runs once before the remaining length is tested.
+ * Round values are AES128 = 4, AES192 = 5 and AES256 = 6.
+ *
+ */
+_GLOBAL(ppc_decrypt_ecb)
+       INITIALIZE_CRYPT(PPC_AES_4K_DECTAB, 0)
+       addi            rT1,rT0,4096    /* rT1 = decrypt table + 4096   */
+ppc_decrypt_ecb_loop:
+       LOAD_DATA(rD0, 0)
+       mr              rKP,rKS         /* key pointer saved at entry   */
+       LOAD_DATA(rD1, 4)
+       subi            rLN,rLN,16      /* one block consumed           */
+       LOAD_DATA(rD2, 8)
+       cmpwi           rLN,15          /* cr0 is tested after the call */
+       LOAD_DATA(rD3, 12)
+       START_KEY(rD0, rD1, rD2, rD3)
+       bl              ppc_decrypt_block
+       xor             rD0,rD0,rW0     /* final xor with rW0-rW3       */
+       SAVE_DATA(rD0, 0)
+       xor             rD1,rD1,rW1
+       SAVE_DATA(rD1, 4)
+       xor             rD2,rD2,rW2
+       SAVE_DATA(rD2, 8)
+       xor             rD3,rD3,rW3
+       SAVE_DATA(rD3, 12)
+       NEXT_BLOCK
+       bt              gt,ppc_decrypt_ecb_loop /* more than 15 bytes left */
+       FINALIZE_CRYPT(0)
+       blr
+
+/*
+ * ppc_encrypt_cbc(u8 *out, const u8 *in, u32 *key_enc,
+ *                u32 rounds, u32 bytes, u8 *iv);
+ *
+ * Called from the glue layer to encrypt multiple blocks via CBC.
+ * bytes must be at least 16 and only whole blocks are processed.
+ * The last output block is written back to iv on exit.
+ * Round values are AES128 = 4, AES192 = 5 and AES256 = 6.
+ *
+ */
+_GLOBAL(ppc_encrypt_cbc)
+       INITIALIZE_CRYPT(PPC_AES_4K_ENCTAB, 4)
+       LOAD_IV(rI0, 0)                 /* rI0-rI3 = chaining value     */
+       LOAD_IV(rI1, 4)
+       LOAD_IV(rI2, 8)
+       LOAD_IV(rI3, 12)
+ppc_encrypt_cbc_loop:
+       LOAD_DATA(rD0, 0)
+       mr              rKP,rKS         /* key pointer saved at entry   */
+       LOAD_DATA(rD1, 4)
+       subi            rLN,rLN,16      /* one block consumed           */
+       LOAD_DATA(rD2, 8)
+       cmpwi           rLN,15          /* cr0 is tested after the call */
+       LOAD_DATA(rD3, 12)
+       xor             rD0,rD0,rI0     /* input ^ chaining value       */
+       xor             rD1,rD1,rI1
+       xor             rD2,rD2,rI2
+       xor             rD3,rD3,rI3
+       START_KEY(rD0, rD1, rD2, rD3)
+       bl              ppc_encrypt_block
+       xor             rI0,rD0,rW0     /* output is the next chaining  */
+       SAVE_DATA(rI0, 0)               /* value and is stored as well  */
+       xor             rI1,rD1,rW1
+       SAVE_DATA(rI1, 4)
+       xor             rI2,rD2,rW2
+       SAVE_DATA(rI2, 8)
+       xor             rI3,rD3,rW3
+       SAVE_DATA(rI3, 12)
+       NEXT_BLOCK
+       bt              gt,ppc_encrypt_cbc_loop /* more than 15 bytes left */
+       START_IV                        /* rewind rIP where it moved    */
+       SAVE_IV(rI0, 0)                 /* last output block -> iv      */
+       SAVE_IV(rI1, 4)
+       SAVE_IV(rI2, 8)
+       SAVE_IV(rI3, 12)
+       FINALIZE_CRYPT(4)
+       blr
+
+/*
+ * ppc_decrypt_cbc(u8 *out, const u8 *in, u32 *key_dec,
+ *                u32 rounds, u32 bytes, u8 *iv);
+ *
+ * Called from the glue layer to decrypt multiple blocks via CBC.
+ * Blocks are handled last to first; the last input block is stored
+ * to iv up front. Round values are AES128 = 4, AES192 = 5, AES256 = 6.
+ */
+_GLOBAL(ppc_decrypt_cbc)
+       INITIALIZE_CRYPT(PPC_AES_4K_DECTAB, 4)
+       li              rT1,15
+       LOAD_IV(rI0, 0)                 /* rI0-rI3 = initial IV         */
+       andc            rLN,rLN,rT1     /* round down to whole blocks   */
+       LOAD_IV(rI1, 4)
+       subi            rLN,rLN,16      /* rLN = offset of last block   */
+       LOAD_IV(rI2, 8)
+       add             rSP,rSP,rLN     /* reverse processing           */
+       LOAD_IV(rI3, 12)
+       add             rDP,rDP,rLN
+       LOAD_DATA(rD0, 0)               /* rD0-rD3 = last input block   */
+       addi            rT1,rT0,4096    /* rT1 = decrypt table + 4096   */
+       LOAD_DATA(rD1, 4)
+       LOAD_DATA(rD2, 8)
+       LOAD_DATA(rD3, 12)
+       START_IV                        /* rewind rIP where it moved    */
+       SAVE_IV(rD0, 0)                 /* last input block -> iv       */
+       SAVE_IV(rD1, 4)
+       SAVE_IV(rD2, 8)
+       cmpwi           rLN,16
+       SAVE_IV(rD3, 12)
+       bt              lt,ppc_decrypt_cbc_end  /* only a single block  */
+ppc_decrypt_cbc_loop:
+       mr              rKP,rKS         /* key pointer saved at entry   */
+       START_KEY(rD0, rD1, rD2, rD3)
+       bl              ppc_decrypt_block
+       subi            rLN,rLN,16
+       subi            rSP,rSP,CBC_DEC /* rSP -> preceding input block */
+       xor             rW0,rD0,rW0     /* finish decryption into rW0-3 */
+       LOAD_DATA(rD0, 0)               /* rD0-rD3 = preceding block    */
+       xor             rW1,rD1,rW1
+       LOAD_DATA(rD1, 4)
+       xor             rW2,rD2,rW2
+       LOAD_DATA(rD2, 8)
+       xor             rW3,rD3,rW3
+       LOAD_DATA(rD3, 12)
+       xor             rW0,rW0,rD0     /* xor with preceding block     */
+       SAVE_DATA(rW0, 0)
+       xor             rW1,rW1,rD1
+       SAVE_DATA(rW1, 4)
+       xor             rW2,rW2,rD2
+       SAVE_DATA(rW2, 8)
+       xor             rW3,rW3,rD3
+       SAVE_DATA(rW3, 12)
+       cmpwi           rLN,15
+       subi            rDP,rDP,CBC_DEC /* rDP -> preceding output slot */
+       bt              gt,ppc_decrypt_cbc_loop /* more than 15 bytes left */
+ppc_decrypt_cbc_end:
+       mr              rKP,rKS         /* key pointer saved at entry   */
+       START_KEY(rD0, rD1, rD2, rD3)   /* rD0-rD3 is the first block   */
+       bl              ppc_decrypt_block
+       xor             rW0,rW0,rD0     /* finish decryption into rW0-3 */
+       xor             rW1,rW1,rD1
+       xor             rW2,rW2,rD2
+       xor             rW3,rW3,rD3
+       xor             rW0,rW0,rI0     /* decrypt with initial IV      */
+       SAVE_DATA(rW0, 0)
+       xor             rW1,rW1,rI1
+       SAVE_DATA(rW1, 4)
+       xor             rW2,rW2,rI2
+       SAVE_DATA(rW2, 8)
+       xor             rW3,rW3,rI3
+       SAVE_DATA(rW3, 12)
+       FINALIZE_CRYPT(4)
+       blr
+
+/*
+ * ppc_crypt_ctr(u8 *out, const u8 *in, u32 *key_enc,
+ *              u32 rounds, u32 bytes, u8 *iv);
+ *
+ * Called from the glue layer to encrypt/decrypt multiple blocks
+ * via CTR. The number of bytes does not need to be a multiple of
+ * 16. Round values are AES128 = 4, AES192 = 5, AES256 = 6.
+ * The incremented counter is written back to iv on exit.
+ */
+_GLOBAL(ppc_crypt_ctr)
+       INITIALIZE_CRYPT(PPC_AES_4K_ENCTAB, 4)
+       LOAD_IV(rI0, 0)                 /* rI0-rI3 = counter, rI3 low   */
+       LOAD_IV(rI1, 4)
+       LOAD_IV(rI2, 8)
+       cmpwi           rLN,16
+       LOAD_IV(rI3, 12)
+       START_IV                        /* rewind rIP where it moved    */
+       bt              lt,ppc_crypt_ctr_partial /* less than one block  */
+ppc_crypt_ctr_loop:
+       mr              rKP,rKS         /* key pointer saved at entry   */
+       START_KEY(rI0, rI1, rI2, rI3)   /* encrypt the counter block    */
+       bl              ppc_encrypt_block
+       xor             rW0,rD0,rW0     /* rW0-rW3 = keystream words    */
+       xor             rW1,rD1,rW1
+       xor             rW2,rD2,rW2
+       xor             rW3,rD3,rW3
+       LOAD_DATA(rD0, 0)
+       subi            rLN,rLN,16      /* one block consumed           */
+       LOAD_DATA(rD1, 4)
+       LOAD_DATA(rD2, 8)
+       LOAD_DATA(rD3, 12)
+       xor             rD0,rD0,rW0     /* input ^ keystream            */
+       SAVE_DATA(rD0, 0)
+       xor             rD1,rD1,rW1
+       SAVE_DATA(rD1, 4)
+       xor             rD2,rD2,rW2
+       SAVE_DATA(rD2, 8)
+       xor             rD3,rD3,rW3
+       SAVE_DATA(rD3, 12)
+       addic           rI3,rI3,1       /* increase counter                     */
+       addze           rI2,rI2         /* ripple the carry upwards     */
+       addze           rI1,rI1
+       addze           rI0,rI0
+       NEXT_BLOCK
+       cmpwi           rLN,15
+       bt              gt,ppc_crypt_ctr_loop   /* another full block   */
+ppc_crypt_ctr_partial:
+       cmpwi           rLN,0
+       bt              eq,ppc_crypt_ctr_end    /* no tail bytes left   */
+       mr              rKP,rKS         /* key pointer saved at entry   */
+       START_KEY(rI0, rI1, rI2, rI3)   /* encrypt the counter block    */
+       bl              ppc_encrypt_block
+       xor             rW0,rD0,rW0     /* keystream is parked in the   */
+       SAVE_IV(rW0, 0)                 /* iv buffer for byte access    */
+       xor             rW1,rD1,rW1
+       SAVE_IV(rW1, 4)
+       xor             rW2,rD2,rW2
+       SAVE_IV(rW2, 8)
+       xor             rW3,rD3,rW3
+       SAVE_IV(rW3, 12)
+       mtctr           rLN             /* CTR = number of tail bytes   */
+       subi            rIP,rIP,CTR_DEC /* rIP = iv buffer start - 1    */
+       subi            rSP,rSP,1       /* bias for the pre-increment   */
+       subi            rDP,rDP,1       /* lbzu/stbu accesses below     */
+ppc_crypt_ctr_xorbyte:
+       lbzu            rW4,1(rIP)      /* bytewise xor for partial block       */
+       lbzu            rW5,1(rSP)
+       xor             rW4,rW4,rW5
+       stbu            rW4,1(rDP)
+       bdnz            ppc_crypt_ctr_xorbyte
+       subf            rIP,rLN,rIP     /* rIP -= rLN, then + 1 puts it */
+       addi            rIP,rIP,1       /* back at the iv buffer start  */
+       addic           rI3,rI3,1       /* increase counter once more   */
+       addze           rI2,rI2
+       addze           rI1,rI1
+       addze           rI0,rI0
+ppc_crypt_ctr_end:
+       SAVE_IV(rI0, 0)                 /* counter overwrites iv buffer */
+       SAVE_IV(rI1, 4)
+       SAVE_IV(rI2, 8)
+       SAVE_IV(rI3, 12)
+       FINALIZE_CRYPT(4)
+       blr
+
+/*
+ * ppc_encrypt_xts(u8 *out, const u8 *in, u32 *key_enc,
+ *                u32 rounds, u32 bytes, u8 *iv, u32 *key_twk);
+ *
+ * Called from the glue layer to encrypt multiple blocks via XTS.
+ * If key_twk is given, the IV is first encrypted with that key to
+ * form the initial tweak. The tweak for the next block is written
+ * back to iv. Round values are AES128 = 4, AES192 = 5, AES256 = 6.
+ *
+ */
+_GLOBAL(ppc_encrypt_xts)
+       INITIALIZE_CRYPT(PPC_AES_4K_ENCTAB, 8)
+       LOAD_IV(rI0, 0)                 /* rI0-rI3 = IV / tweak         */
+       LOAD_IV(rI1, 4)
+       LOAD_IV(rI2, 8)
+       cmpwi           rKT,0
+       LOAD_IV(rI3, 12)
+       bt              eq,ppc_encrypt_xts_notweak /* rKT == 0: use IV as is */
+       mr              rKP,rKT         /* run one encryption with rKT  */
+       START_KEY(rI0, rI1, rI2, rI3)
+       bl              ppc_encrypt_block
+       xor             rI0,rD0,rW0     /* rI0-rI3 = encrypted IV       */
+       xor             rI1,rD1,rW1
+       xor             rI2,rD2,rW2
+       xor             rI3,rD3,rW3
+ppc_encrypt_xts_notweak:
+       ENDIAN_SWAP(rG0, rG1, rI0, rI1) /* rG0-rG3 = byte swapped tweak */
+       ENDIAN_SWAP(rG2, rG3, rI2, rI3) /* as input for GF128_MUL       */
+ppc_encrypt_xts_loop:
+       LOAD_DATA(rD0, 0)
+       mr              rKP,rKS         /* key pointer saved at entry   */
+       LOAD_DATA(rD1, 4)
+       subi            rLN,rLN,16      /* one block consumed           */
+       LOAD_DATA(rD2, 8)
+       LOAD_DATA(rD3, 12)
+       xor             rD0,rD0,rI0     /* input ^ tweak                */
+       xor             rD1,rD1,rI1
+       xor             rD2,rD2,rI2
+       xor             rD3,rD3,rI3
+       START_KEY(rD0, rD1, rD2, rD3)
+       bl              ppc_encrypt_block
+       xor             rD0,rD0,rW0     /* final xor with rW0-rW3       */
+       xor             rD1,rD1,rW1
+       xor             rD2,rD2,rW2
+       xor             rD3,rD3,rW3
+       xor             rD0,rD0,rI0     /* result ^ tweak               */
+       SAVE_DATA(rD0, 0)
+       xor             rD1,rD1,rI1
+       SAVE_DATA(rD1, 4)
+       xor             rD2,rD2,rI2
+       SAVE_DATA(rD2, 8)
+       xor             rD3,rD3,rI3
+       SAVE_DATA(rD3, 12)
+       GF128_MUL(rG0, rG1, rG2, rG3, rW0) /* next tweak, rW0 is scratch */
+       ENDIAN_SWAP(rI0, rI1, rG0, rG1) /* swap back into rI0-rI3       */
+       ENDIAN_SWAP(rI2, rI3, rG2, rG3)
+       cmpwi           rLN,0
+       NEXT_BLOCK
+       bt              gt,ppc_encrypt_xts_loop /* bytes still remaining */
+       START_IV                        /* rewind rIP where it moved    */
+       SAVE_IV(rI0, 0)                 /* next tweak -> iv             */
+       SAVE_IV(rI1, 4)
+       SAVE_IV(rI2, 8)
+       SAVE_IV(rI3, 12)
+       FINALIZE_CRYPT(8)
+       blr
+
+/*
+ * ppc_decrypt_xts(u8 *out, const u8 *in, u32 *key_dec,
+ *                u32 rounds, u32 bytes, u8 *iv, u32 *key_twk);
+ *
+ * Called from the glue layer to decrypt multiple blocks via XTS.
+ * If key_twk is given, the IV is first encrypted with that key to
+ * form the initial tweak. The tweak for the next block is written
+ * back to iv. Round values are AES128 = 4, AES192 = 5, AES256 = 6.
+ *
+ */
+_GLOBAL(ppc_decrypt_xts)
+       INITIALIZE_CRYPT(PPC_AES_4K_DECTAB, 8)
+       LOAD_IV(rI0, 0)                 /* rI0-rI3 = IV / tweak         */
+       addi            rT1,rT0,4096    /* rT1 = decrypt table + 4096   */
+       LOAD_IV(rI1, 4)
+       LOAD_IV(rI2, 8)
+       cmpwi           rKT,0
+       LOAD_IV(rI3, 12)
+       bt              eq,ppc_decrypt_xts_notweak /* rKT == 0: use IV as is */
+       subi            rT0,rT0,4096    /* presumably the encrypt table */
+       mr              rKP,rKT         /* run one encryption with rKT  */
+       START_KEY(rI0, rI1, rI2, rI3)
+       bl              ppc_encrypt_block
+       xor             rI0,rD0,rW0     /* rI0-rI3 = encrypted IV       */
+       xor             rI1,rD1,rW1
+       xor             rI2,rD2,rW2
+       xor             rI3,rD3,rW3
+       addi            rT0,rT0,4096    /* rT0 back to the decrypt table */
+ppc_decrypt_xts_notweak:
+       ENDIAN_SWAP(rG0, rG1, rI0, rI1) /* rG0-rG3 = byte swapped tweak */
+       ENDIAN_SWAP(rG2, rG3, rI2, rI3) /* as input for GF128_MUL       */
+ppc_decrypt_xts_loop:
+       LOAD_DATA(rD0, 0)
+       mr              rKP,rKS         /* key pointer saved at entry   */
+       LOAD_DATA(rD1, 4)
+       subi            rLN,rLN,16      /* one block consumed           */
+       LOAD_DATA(rD2, 8)
+       LOAD_DATA(rD3, 12)
+       xor             rD0,rD0,rI0     /* input ^ tweak                */
+       xor             rD1,rD1,rI1
+       xor             rD2,rD2,rI2
+       xor             rD3,rD3,rI3
+       START_KEY(rD0, rD1, rD2, rD3)
+       bl              ppc_decrypt_block
+       xor             rD0,rD0,rW0     /* final xor with rW0-rW3       */
+       xor             rD1,rD1,rW1
+       xor             rD2,rD2,rW2
+       xor             rD3,rD3,rW3
+       xor             rD0,rD0,rI0     /* result ^ tweak               */
+       SAVE_DATA(rD0, 0)
+       xor             rD1,rD1,rI1
+       SAVE_DATA(rD1, 4)
+       xor             rD2,rD2,rI2
+       SAVE_DATA(rD2, 8)
+       xor             rD3,rD3,rI3
+       SAVE_DATA(rD3, 12)
+       GF128_MUL(rG0, rG1, rG2, rG3, rW0) /* next tweak, rW0 is scratch */
+       ENDIAN_SWAP(rI0, rI1, rG0, rG1) /* swap back into rI0-rI3       */
+       ENDIAN_SWAP(rI2, rI3, rG2, rG3)
+       cmpwi           rLN,0
+       NEXT_BLOCK
+       bt              gt,ppc_decrypt_xts_loop /* bytes still remaining */
+       START_IV                        /* rewind rIP where it moved    */
+       SAVE_IV(rI0, 0)                 /* next tweak -> iv             */
+       SAVE_IV(rI1, 4)
+       SAVE_IV(rI2, 8)
+       SAVE_IV(rI3, 12)
+       FINALIZE_CRYPT(8)
+       blr