; AesOpt.asm -- Intel's AES.
; 2009-12-12 : Igor Pavlov : Public domain
;
; AES-NI implementations of CBC decode, CBC encode and CTR processing.
;
; Syntax: MASM.  The register aliases r0..r6 / x3 / x6, REG_SIZE and the
; MY_ASM_START / MY_PROC / MY_ENDP macros are taken from 7zAsm.asm (not part
; of this file).
;
; Register roles shared by all three procedures, as established by MY_PROLOG:
;   r1  - on entry points to a 16-byte-aligned state block laid out as
;           [r1 +  0] : 16-byte chaining value (IV / counter)
;           [r1 + 16] : dword N, scaled by 32 below, i.e. it counts pairs of
;                       16-byte round keys
;           [r1 + 32] : round keys, 16 bytes each
;         MY_PROLOG advances r1 by 32 so that it points at the round keys.
;   rD  - (r2) data pointer, processed in place, 16 bytes per block
;   rN  - (r0) number of 16-byte blocks still to process
;   r3  - running round-key offset inside the DECODE / ENCODE loops
;   r6  - N * 32 after MY_PROLOG (each procedure then adjusts it)
;
; All memory accesses use movdqa / aligned memory operands, so both the state
; block and the data buffer must be 16-byte aligned.

include 7zAsm.asm
MY_ASM_START

ifndef x64
    .xmm
endif

; Block count: r8 in the x64 build; otherwise read from the stack.  The offset
; REG_SIZE * 4 skips the three registers pushed by MY_PROLOG plus the return
; address.
ifdef x64
    num     equ r8
else
    num     equ [r4 + REG_SIZE * 4]
endif

rD equ r2
rN equ r0

; MY_PROLOG reg
;   x64 only: saves xmm6 / xmm7 in the 32 bytes just above the return address
;   (16-byte aligned at function entry under the Win64 convention).
;   Saves r3, r5, r6, loads rN and r6 = N * 32, loads the chaining value from
;   [r1] into reg and leaves r1 pointing at the first round key.
MY_PROLOG macro reg:req
    ifdef x64
    movdqa  [r4 + 8], xmm6
    movdqa  [r4 + 8 + 16], xmm7
    endif

    push    r3
    push    r5
    push    r6

    mov     rN, num
    mov     x6, [r1 + 16]
    shl     x6, 5

    movdqa  reg, [r1]
    add     r1, 32
endm

; MY_EPILOG
;   Exact mirror of MY_PROLOG: restores r6, r5, r3 and (x64) xmm6 / xmm7, then
;   closes the procedure through MY_ENDP.
MY_EPILOG macro
    pop     r6
    pop     r5
    pop     r3

    ifdef x64
    movdqa  xmm6, [r4 + 8]
    movdqa  xmm7, [r4 + 8 + 16]
    endif

    MY_ENDP
endm

; Number of blocks handled per iteration of the wide loops.  xmm0 .. xmm(ways-1)
; carry the data; xmm5, xmm6 and xmm7 have fixed roles below, so ways must not
; exceed 5.
ways equ 4
ways16 equ (ways * 16)

; OP_W op, op2
;   Emits "op xmm<i>, op2" for i = 0 .. ways-1.  op2 may reference i; it is
;   substituted as text and evaluated for each repetition.
OP_W macro op, op2
    i = 0
    rept ways
    op @CatStr(xmm,%i), op2
    i = i + 1
    endm
endm

; LOAD_OP op, offs
;   Single-block form: applies op to xmm0 with the round key at [r1 + r3 offs].
LOAD_OP macro op:req, offs:req
    op      xmm0, [r1 + r3 offs]
endm

; LOAD_OP_W op, offs
;   Wide form: loads the round key at [r1 + r3 offs] into xmm7 once and applies
;   op to every data register.  Clobbers xmm7.
LOAD_OP_W macro op:req, offs:req
    movdqa  xmm7, [r1 + r3 offs]
    OP_W    op, xmm7
endm


; ---------- AES-CBC Decode ----------

; CBC_DEC_UPDATE reg, offs
;   reg holds a decrypted block.  XORs it with the previous ciphertext block
;   kept in xmm6, reloads xmm6 with the ciphertext still stored at [rD + offs],
;   then overwrites that location with the plaintext.
CBC_DEC_UPDATE macro reg, offs
    pxor    reg, xmm6
    movdqa  xmm6, [rD + offs]
    movdqa  [rD + offs], reg
endm

; DECODE op
;   Runs the AES decryption rounds.  op is LOAD_OP or LOAD_OP_W.  x3 walks the
;   round keys downwards, 32 bytes (two keys) per loop pass, and is zero when
;   the final aesdeclast uses the key at [r1].
DECODE macro op:req
    op      aesdec, +16
  @@:
    op      aesdec, +0
    op      aesdec, -16
    sub     x3, 32
    jnz     @B
    op      aesdeclast, +0
endm

; AesCbc_Decode_Intel
;   Decrypts rN blocks at rD in place in CBC mode.  xmm6 carries the previous
;   ciphertext block (initially the chaining value from the state block) and is
;   written back to the state block on exit.
MY_PROC AesCbc_Decode_Intel, 3
    MY_PROLOG xmm6

    sub     x6, 32                      ; x6 = offset of the second-to-last key pair

    jmp     check2

  ; ---- wide loop: "ways" blocks per iteration ----
  align 16
  nextBlocks2:
    mov     x3, x6
    OP_W    movdqa, [rD + i * 16]
    LOAD_OP_W  pxor, +32                ; initial AddRoundKey with the last key
    DECODE  LOAD_OP_W
    OP_W    CBC_DEC_UPDATE, i * 16
    add     rD, ways16
  check2:
    sub     rN, ways
    jnc     nextBlocks2

    add     rN, ways                    ; undo the failed subtraction: 0 .. ways-1 left
    jmp     check

  ; ---- tail loop: one block per iteration ----
  nextBlock:
    mov     x3, x6
    movdqa  xmm1, [rD]                  ; keep the ciphertext for the next chain step
    LOAD_OP movdqa, +32
    pxor    xmm0, xmm1
    DECODE  LOAD_OP
    pxor    xmm0, xmm6
    movdqa  [rD], xmm0
    movdqa  xmm6, xmm1
    add     rD, 16
  check:
    sub     rN, 1
    jnc     nextBlock

    movdqa  [r1 - 32], xmm6             ; r1 - 32 = start of the state block
    MY_EPILOG


; ---------- AES-CBC Encode ----------

; ENCODE op
;   Runs the AES encryption rounds.  op is LOAD_OP or LOAD_OP_W.  r3 starts
;   negative and climbs by 32 per loop pass; r1 has been moved to the last
;   round key, so r3 reaches zero exactly when aesenclast uses [r1].
ENCODE macro op:req
    op      aesenc, -16
  @@:
    op      aesenc, +0
    op      aesenc, +16
    add     r3, 32
    jnz     @B
    op      aesenclast, +0
endm

; AesCbc_Encode_Intel
;   Encrypts rN blocks at rD in place in CBC mode.  xmm0 carries the running
;   chaining value and is written back to the state block on exit.  CBC
;   encryption is inherently serial, so there is no wide loop.
MY_PROC AesCbc_Encode_Intel, 3
    MY_PROLOG xmm0

    add     r1, r6                      ; r1 -> last round key
    neg     r6
    add     r6, 32                      ; r6 = 32 - N * 32 (start value for r3)

    jmp     check_e

  align 16
  nextBlock_e:
    mov     r3, r6
    pxor    xmm0, [rD]                  ; chain: previous ciphertext XOR plaintext
    pxor    xmm0, [r1 + r3 - 32]        ; initial AddRoundKey with the first key
    ENCODE  LOAD_OP
    movdqa  [rD], xmm0
    add     rD, 16
  check_e:
    sub     rN, 1
    jnc     nextBlock_e

    movdqa  [r1 + r6 - 64], xmm0        ; r1 + r6 - 64 = start of the state block
    MY_EPILOG


; ---------- AES-CTR ----------

; XOR_UPD_1 reg, offs : reg ^= data block at [rD + offs]
XOR_UPD_1 macro reg, offs
    pxor    reg, [rD + offs]
endm

; XOR_UPD_2 reg, offs : store reg to [rD + offs]
XOR_UPD_2 macro reg, offs
    movdqa  [rD + offs], reg
endm

; AesCtr_Code_Intel
;   CTR mode over rN blocks at rD, in place.  xmm6 holds the counter; it is
;   incremented (64-bit add on the low qword) before each block is generated
;   and written back to the state block on exit.
;
;   xmm5 holds the increment { low qword = 1, high qword = 0 } for the whole
;   procedure.  It is built in registers: an earlier revision stored this
;   constant in memory below the stack pointer without reserving it, which is
;   not safe where no red zone exists (Win64, 32-bit x86) because anything that
;   uses the stack asynchronously may overwrite it.  xmm5 is not used by any
;   macro in this file and is a caller-saved register, so it needs no
;   save / restore.
MY_PROC AesCtr_Code_Intel, 3
    MY_PROLOG xmm6

    pcmpeqd xmm5, xmm5                  ; all bits set
    psrlq   xmm5, 63                    ; each qword = 1
    psrldq  xmm5, 8                     ; low qword = 1, high qword = 0

    add     r1, r6                      ; r1 -> last round key
    neg     r6
    add     r6, 32                      ; r6 = 32 - N * 32 (start value for r3)

    jmp     check2_c

  ; ---- wide loop: "ways" counter blocks per iteration ----
  align 16
  nextBlocks2_c:
    i = 0
    rept ways
    paddq   xmm6, xmm5
    movdqa  @CatStr(xmm,%i), xmm6
    i = i + 1
    endm

    mov     r3, r6
    LOAD_OP_W  pxor, -32                ; initial AddRoundKey with the first key
    ENCODE  LOAD_OP_W
    OP_W    XOR_UPD_1, i * 16
    OP_W    XOR_UPD_2, i * 16
    add     rD, ways16
  check2_c:
    sub     rN, ways
    jnc     nextBlocks2_c

    add     rN, ways                    ; undo the failed subtraction: 0 .. ways-1 left
    jmp     check_c

  ; ---- tail loop: one counter block per iteration ----
  nextBlock_c:
    paddq   xmm6, xmm5
    mov     r3, r6
    movdqa  xmm0, [r1 + r3 - 32]
    pxor    xmm0, xmm6
    ENCODE  LOAD_OP
    XOR_UPD_1 xmm0, 0
    XOR_UPD_2 xmm0, 0
    add     rD, 16
  check_c:
    sub     rN, 1
    jnc     nextBlock_c

    movdqa  [r1 + r6 - 64], xmm6        ; r1 + r6 - 64 = start of the state block
    MY_EPILOG

end