C nettle, low-level cryptographics library
C
C Copyright (C) 2011 Niels Möller
C
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
C License for more details.
C
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB.  If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
C MA 02111-1301, USA.

C Syntax: AT&T operand order, preprocessed with m4 using angle-bracket quotes.
C WROL, WTRANSPOSE and WKEYXOR are not defined in this file; they are expected
C to come from the include below.
C NOTE(review): the include argument and all macro names in this file were
C reconstructed after the quoted names had been lost. The include path is
C assumed from the upstream nettle source layout - confirm.
include_src(<x86_64/serpent.m4>)

C Register usage:

C Single block serpent state, two copies.
C Each round reads one copy and writes the other.
C %ebx, %ebp and %r12d are callee-saved and are pushed in the prologue.
define(<x0>, <%eax>)
define(<x1>, <%ebx>)
define(<x2>, <%ebp>)
define(<x3>, <%r8d>)

define(<y0>, <%r9d>)
define(<y1>, <%r10d>)
define(<y2>, <%r11d>)
define(<y3>, <%r12d>)

C Quadruple block serpent state, two copies
define(<X0>, <%xmm0>)
define(<X1>, <%xmm1>)
define(<X2>, <%xmm2>)
define(<X3>, <%xmm3>)

define(<Y0>, <%xmm4>)
define(<Y1>, <%xmm5>)
define(<Y2>, <%xmm6>)
define(<Y3>, <%xmm7>)

C MINUS1 holds all ones for PNOT. T0 is the scratch register used by WLTI.
C T1 - T3 are not referenced in this file.
C NOTE(review): presumably they are scratch for the included wide macros.
define(<MINUS1>, <%xmm8>)
define(<T0>, <%xmm9>)
define(<T1>, <%xmm10>)
define(<T2>, <%xmm11>)
define(<T3>, <%xmm12>)

C Arguments
define(<CTX>, <%rdi>)
define(<N>, <%rsi>)
define(<DST>, <%rdx>)
define(<SRC>, <%rcx>)

C CNT is the byte offset of the current group of eight subkeys (used as an
C index register). TMP32 is the 32-bit scratch register used by LTI.
C Both live in callee-saved registers that are pushed in the prologue.
define(<CNT>, <%r13>)
define(<TMP32>, <%r14d>)

C SBOX macros. Inputs $1 - $4 (destroyed), outputs $5 - $8
C NOTE(review): the numbering SBOX0I - SBOX7I assumes the eight definitions
C appear in ascending order - confirm against the reference sboxes.

define(<SBOX0I>, <
	mov	$1, $5
	xor	$3, $5
	mov	$1, $7
	or	$2, $7
	mov	$3, $6
	xor	$4, $6
	xor	$6, $7
	and	$3, $6
	or	$2, $3
	xor	$4, $2
	or	$1, $6
	and	$3, $2
	xor	$2, $6
	or	$7, $1
	xor	$6, $1
	mov	$7, $2
	and	$1, $2
	not	$7
	or	$7, $4
	xor	$3, $4
	mov	$1, $8
	xor	$4, $8
	or	$4, $2
	xor	$2, $5
>)

define(<SBOX1I>, <
	mov	$2, $6
	or	$4, $6
	xor	$3, $6
	mov	$1, $8
	xor	$2, $8
	mov	$1, $5
	or	$6, $5
	and	$8, $5
	xor	$5, $2
	xor	$6, $8
	and	$4, $2
	mov	$1, $7
	and	$3, $7
	or	$7, $6
	or	$4, $7
	xor	$5, $7
	not	$7
	xor	$2, $6
	xor	$6, $5
	xor	$3, $5
	or	$7, $1
	xor	$1, $5
>)

define(<SBOX2I>, <
	mov	$1, $5
	xor	$4, $5
	mov	$3, $7
	xor	$4, $7
	mov	$2, $6
	or	$7, $6
	xor	$6, $5
	mov	$4, $6
	or	$5, $6
	and	$2, $6
	not	$4
	mov	$1, $8
	or	$3, $8
	and	$8, $7
	xor	$7, $6
	and	$2, $8
	and	$3, $1
	or	$4, $1
	xor	$1, $8
	and	$8, $3
	xor	$1, $3
	mov	$5, $7
	xor	$6, $7
	xor	$3, $7
>)

define(<SBOX3I>, <
	mov	$3, $8
	or	$4, $8
	mov	$2, $5
	and	$8, $5
	mov	$1, $7
	or	$4, $7
	mov	$3, $6
	xor	$7, $6
	xor	$6, $5
	xor	$1, $4
	xor	$4, $8
	xor	$2, $7
	and	$6, $7
	xor	$4, $7
	xor	$1, $6
	or	$5, $4
	and	$4, $6
	xor	$2, $6
	and	$7, $1
	or	$2, $1
	xor	$1, $8
>)

define(<SBOX4I>, <
	mov	$3, $6
	xor	$4, $6
	mov	$3, $7
	or	$4, $7
	xor	$2, $7
	or	$4, $2
	mov	$1, $5
	xor	$7, $5
	xor	$7, $4
	and	$1, $7
	xor	$7, $6
	xor	$1, $7
	or	$3, $7
	and	$2, $1
	mov	$1, $8
	xor	$4, $8
	not	$1
	or	$6, $1
	xor	$1, $5
	xor	$2, $1
	xor	$1, $7
>)

define(<SBOX5I>, <
	mov	$1, $6
	and	$4, $6
	mov	$3, $8
	xor	$6, $8
	mov	$2, $5
	and	$8, $5
	mov	$1, $7
	xor	$4, $7
	xor	$2, $4
	xor	$7, $5
	and	$1, $3
	and	$5, $1
	or	$2, $3
	xor	$5, $6
	xor	$3, $6
	mov	$5, $7
	or	$6, $7
	xor	$8, $7
	xor	$4, $7
	not	$2
	or	$1, $2
	xor	$2, $8
>)

define(<SBOX6I>, <
	mov	$1, $7
	xor	$3, $7
	not	$3
	mov	$2, $5
	xor	$4, $5
	mov	$1, $6
	or	$3, $6
	xor	$5, $6
	mov	$2, $8
	and	$7, $8
	or	$4, $8
	or	$3, $4
	or	$2, $3
	and	$1, $3
	mov	$3, $5
	xor	$8, $5
	not	$5
	and	$7, $8
	xor	$3, $8
	xor	$6, $1
	xor	$1, $8
	and	$5, $2
	xor	$2, $7
	xor	$4, $7
>)

define(<SBOX7I>, <
	mov	$1, $8
	and	$2, $8
	mov	$2, $7
	xor	$4, $7
	or	$8, $7
	mov	$1, $6
	or	$4, $6
	and	$3, $6
	xor	$6, $7
	or	$3, $8
	mov	$1, $5
	or	$2, $5
	and	$4, $5
	xor	$5, $8
	xor	$2, $5
	mov	$4, $6
	xor	$8, $6
	not	$6
	or	$5, $6
	xor	$3, $5
	xor	$1, $6
	or	$6, $4
	xor	$4, $5
>)

C Inverse linear transformation, in place on the four 32-bit words $1 - $4.
C Clobbers TMP32. Written with left rotates only:
C	$3 >>>= 22,  $1 >>>= 5
C	$3 ^= $4 ^ ($2 << 7),  $1 ^= $2 ^ $4
C	$4 >>>= 7,   $2 >>>= 1
C	$4 ^= $3 ^ ($1 << 3),  $2 ^= $1 ^ $3
C	$3 >>>= 3,   $1 >>>= 13
define(<LTI>, <
	rol	<$>10, $3
	rol	<$>27, $1
	mov	$2, TMP32
	shl	<$>7, TMP32
	xor	$4, $3
	xor	TMP32, $3
	xor	$2, $1
	xor	$4, $1
	rol	<$>25, $4
	rol	<$>31, $2
	mov	$1, TMP32
	shl	<$>3, TMP32
	xor	$3, $4
	xor	TMP32, $4
	xor	$1, $2
	xor	$3, $2
	rol	<$>29, $3
	rol	<$>19, $1
>)

C Bitwise complement of the xmm register $1. Requires MINUS1 to be all ones.
define(<PNOT>, <
	pxor	MINUS1, $1
>)

C Wide (xmm) versions of the SBOX macros above: the same operation sequence
C with movdqa/pxor/por/pand, and PNOT in place of not.
C Inputs $1 - $4 (destroyed), outputs $5 - $8

define(<WSBOX0I>, <
	movdqa	$1, $5
	pxor	$3, $5
	movdqa	$1, $7
	por	$2, $7
	movdqa	$3, $6
	pxor	$4, $6
	pxor	$6, $7
	pand	$3, $6
	por	$2, $3
	pxor	$4, $2
	por	$1, $6
	pand	$3, $2
	pxor	$2, $6
	por	$7, $1
	pxor	$6, $1
	movdqa	$7, $2
	pand	$1, $2
	PNOT($7)
	por	$7, $4
	pxor	$3, $4
	movdqa	$1, $8
	pxor	$4, $8
	por	$4, $2
	pxor	$2, $5
>)

define(<WSBOX1I>, <
	movdqa	$2, $6
	por	$4, $6
	pxor	$3, $6
	movdqa	$1, $8
	pxor	$2, $8
	movdqa	$1, $5
	por	$6, $5
	pand	$8, $5
	pxor	$5, $2
	pxor	$6, $8
	pand	$4, $2
	movdqa	$1, $7
	pand	$3, $7
	por	$7, $6
	por	$4, $7
	pxor	$5, $7
	PNOT($7)
	pxor	$2, $6
	pxor	$6, $5
	pxor	$3, $5
	por	$7, $1
	pxor	$1, $5
>)

define(<WSBOX2I>, <
	movdqa	$1, $5
	pxor	$4, $5
	movdqa	$3, $7
	pxor	$4, $7
	movdqa	$2, $6
	por	$7, $6
	pxor	$6, $5
	movdqa	$4, $6
	por	$5, $6
	pand	$2, $6
	PNOT($4)
	movdqa	$1, $8
	por	$3, $8
	pand	$8, $7
	pxor	$7, $6
	pand	$2, $8
	pand	$3, $1
	por	$4, $1
	pxor	$1, $8
	pand	$8, $3
	pxor	$1, $3
	movdqa	$5, $7
	pxor	$6, $7
	pxor	$3, $7
>)

define(<WSBOX3I>, <
	movdqa	$3, $8
	por	$4, $8
	movdqa	$2, $5
	pand	$8, $5
	movdqa	$1, $7
	por	$4, $7
	movdqa	$3, $6
	pxor	$7, $6
	pxor	$6, $5
	pxor	$1, $4
	pxor	$4, $8
	pxor	$2, $7
	pand	$6, $7
	pxor	$4, $7
	pxor	$1, $6
	por	$5, $4
	pand	$4, $6
	pxor	$2, $6
	pand	$7, $1
	por	$2, $1
	pxor	$1, $8
>)

define(<WSBOX4I>, <
	movdqa	$3, $6
	pxor	$4, $6
	movdqa	$3, $7
	por	$4, $7
	pxor	$2, $7
	por	$4, $2
	movdqa	$1, $5
	pxor	$7, $5
	pxor	$7, $4
	pand	$1, $7
	pxor	$7, $6
	pxor	$1, $7
	por	$3, $7
	pand	$2, $1
	movdqa	$1, $8
	pxor	$4, $8
	PNOT($1)
	por	$6, $1
	pxor	$1, $5
	pxor	$2, $1
	pxor	$1, $7
>)

define(<WSBOX5I>, <
	movdqa	$1, $6
	pand	$4, $6
	movdqa	$3, $8
	pxor	$6, $8
	movdqa	$2, $5
	pand	$8, $5
	movdqa	$1, $7
	pxor	$4, $7
	pxor	$2, $4
	pxor	$7, $5
	pand	$1, $3
	pand	$5, $1
	por	$2, $3
	pxor	$5, $6
	pxor	$3, $6
	movdqa	$5, $7
	por	$6, $7
	pxor	$8, $7
	pxor	$4, $7
	PNOT($2)
	por	$1, $2
	pxor	$2, $8
>)

define(<WSBOX6I>, <
	movdqa	$1, $7
	pxor	$3, $7
	PNOT($3)
	movdqa	$2, $5
	pxor	$4, $5
	movdqa	$1, $6
	por	$3, $6
	pxor	$5, $6
	movdqa	$2, $8
	pand	$7, $8
	por	$4, $8
	por	$3, $4
	por	$2, $3
	pand	$1, $3
	movdqa	$3, $5
	pxor	$8, $5
	PNOT($5)
	pand	$7, $8
	pxor	$3, $8
	pxor	$6, $1
	pxor	$1, $8
	pand	$5, $2
	pxor	$2, $7
	pxor	$4, $7
>)

define(<WSBOX7I>, <
	movdqa	$1, $8
	pand	$2, $8
	movdqa	$2, $7
	pxor	$4, $7
	por	$8, $7
	movdqa	$1, $6
	por	$4, $6
	pand	$3, $6
	pxor	$6, $7
	por	$3, $8
	movdqa	$1, $5
	por	$2, $5
	pand	$4, $5
	pxor	$5, $8
	pxor	$2, $5
	movdqa	$4, $6
	pxor	$8, $6
	PNOT($6)
	por	$5, $6
	pxor	$3, $5
	pxor	$1, $6
	por	$6, $4
	pxor	$4, $5
>)

C Wide version of LTI: same steps on xmm registers $1 - $4, in place.
C Uses T0 as scratch for the shifted terms, and WROL for the rotates.
C NOTE(review): WROL is defined elsewhere; verify which scratch registers it
C clobbers.
define(<WLTI>, <
	WROL(10, $3)
	WROL(27, $1)
	movdqa	$2, T0
	pslld	<$>7, T0
	pxor	$4, $3
	pxor	T0, $3
	pxor	$2, $1
	pxor	$4, $1
	WROL(25, $4)
	WROL(31, $2)
	movdqa	$1, T0
	pslld	<$>3, T0
	pxor	$3, $4
	pxor	T0, $4
	pxor	$1, $2
	pxor	$3, $2
	WROL(29, $3)
	WROL(19, $1)
>)

	.file "serpent-decrypt.asm"

	C serpent_decrypt(struct serpent_context *ctx,
	C	          unsigned length, uint8_t *dst,
	C	          const uint8_t *src)
	C
	C In:    CTX = ctx, N = length in bytes, DST = dst, SRC = src
	C Out:   length bytes written to dst; no return value is set
	C Saves: %rbx, %rbp, %r12, %r13, %r14 (pushed here, popped at .Lend)
	C
	C Subkeys are read from CTX as 16-byte groups: offset 512 holds the
	C key applied first, offsets 496 down to 0 the keys applied after each
	C inverse sbox. Runs of 64 bytes or more go through the four-block xmm
	C loop; what is left goes through the single-block loop 16 bytes at a
	C time.
	C NOTE(review): presumably length is a multiple of 16; the single-block
	C loop always loads and stores a full 16 bytes.
	C NOTE(review): N is used as a full 64-bit register although the
	C prototype above says unsigned - verify the caller-side type.
	.text
	ALIGN(16)
PROLOGUE(nettle_serpent_decrypt)
	C save all registers that need to be saved
	C NOTE(review): W64_ENTRY/W64_EXIT are defined elsewhere; presumably
	C they adapt the Windows x64 convention (4 arguments, 13 xmm registers).
	W64_ENTRY(4, 13)
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14

	C Point SRC and DST at the end of their buffers and negate N, so that
	C (SRC, N) and (DST, N) walk forward while N counts up towards zero.
	C lea leaves the flags alone; the jz below tests the result of neg.
	lea	(SRC, N), SRC
	lea	(DST, N), DST
	neg	N
	jz	.Lend

	C Unsigned compare: N above -64 means fewer than 64 bytes remain.
	cmp	$-64, N
	ja	.Lblock_loop

	C All ones, needed by PNOT
	pcmpeqd	MINUS1, MINUS1

.Lwblock_loop:
	C Four blocks per iteration. Unaligned loads, since the alignment of
	C the source buffer is unknown here.
	movups	(SRC, N), X0
	movups	16(SRC, N), X1
	movups	32(SRC, N), X2
	movups	48(SRC, N), X3

	WTRANSPOSE(X0,X1,X2,X3)

	mov	$384, CNT

	C FIXME: CNT known, no index register needed
	C NOTE(review): WKEYXOR is defined elsewhere; presumably it xors in
	C the subkey at the given offset plus CNT from CTX (here 128 + 384 =
	C 512, matching the scalar path below).
	WKEYXOR(128, X0,X1,X2,X3)

	jmp	.Lwround_start

	ALIGN(16)

.Lwround_loop:
	WLTI(X0,X1,X2,X3)
.Lwround_start:
	C Eight rounds per pass, alternating between the X and Y state copies.
	C The first pass is entered here, skipping the WLTI above.
	WSBOX7I(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
	WKEYXOR(112, Y0,Y1,Y2,Y3)

	WLTI(Y0,Y1,Y2,Y3)
	WSBOX6I(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
	WKEYXOR(96, X0,X1,X2,X3)

	WLTI(X0,X1,X2,X3)
	WSBOX5I(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
	WKEYXOR(80, Y0,Y1,Y2,Y3)

	WLTI(Y0,Y1,Y2,Y3)
	WSBOX4I(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
	WKEYXOR(64, X0,X1,X2,X3)

	WLTI(X0,X1,X2,X3)
	WSBOX3I(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
	WKEYXOR(48, Y0,Y1,Y2,Y3)

	WLTI(Y0,Y1,Y2,Y3)
	WSBOX2I(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
	WKEYXOR(32, X0,X1,X2,X3)

	WLTI(X0,X1,X2,X3)
	WSBOX1I(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
	WKEYXOR(16, Y0,Y1,Y2,Y3)

	WLTI(Y0,Y1,Y2,Y3)
	WSBOX0I(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
	WKEYXOR(, X0,X1,X2,X3)

	C CNT steps 384, 256, 128, 0. The subtraction borrows on the pass that
	C started with CNT = 0, which ends the loop after four passes.
	sub	$128, CNT
	jnc	.Lwround_loop

	WTRANSPOSE(X0,X1,X2,X3)

	movups	X0, (DST, N)
	movups	X1, 16(DST, N)
	movups	X2, 32(DST, N)
	movups	X3, 48(DST, N)

	C FIXME: Adjust N, so we can use just jnc without an extra cmp.
	add	$64, N
	jz	.Lend

	C Unsigned compare: N at or below -64 means at least 64 bytes remain.
	cmp	$-64, N
	jbe	.Lwblock_loop

.Lblock_loop:
	C One block per iteration: load four 32-bit words and xor in the
	C subkey at offset 512.
	movl	(SRC, N), x0
	movl	4(SRC, N), x1
	movl	8(SRC, N), x2
	movl	12(SRC, N), x3

	xor	512(CTX), x0
	xor	516(CTX), x1
	xor	520(CTX), x2
	xor	524(CTX), x3

	mov	$384, CNT
	jmp	.Lround_start

	ALIGN(16)
.Lround_loop:
	LTI(x0,x1,x2,x3)
.Lround_start:
	C Same round structure as the wide loop, with explicit subkey xors.
	SBOX7I(x0,x1,x2,x3, y0,y1,y2,y3)
	xor	112(CTX, CNT), y0
	xor	116(CTX, CNT), y1
	xor	120(CTX, CNT), y2
	xor	124(CTX, CNT), y3

	LTI(y0,y1,y2,y3)
	SBOX6I(y0,y1,y2,y3, x0,x1,x2,x3)
	xor	 96(CTX, CNT), x0
	xor	100(CTX, CNT), x1
	xor	104(CTX, CNT), x2
	xor	108(CTX, CNT), x3

	LTI(x0,x1,x2,x3)
	SBOX5I(x0,x1,x2,x3, y0,y1,y2,y3)
	xor	80(CTX, CNT), y0
	xor	84(CTX, CNT), y1
	xor	88(CTX, CNT), y2
	xor	92(CTX, CNT), y3

	LTI(y0,y1,y2,y3)
	SBOX4I(y0,y1,y2,y3, x0,x1,x2,x3)
	xor	64(CTX, CNT), x0
	xor	68(CTX, CNT), x1
	xor	72(CTX, CNT), x2
	xor	76(CTX, CNT), x3

	LTI(x0,x1,x2,x3)
	SBOX3I(x0,x1,x2,x3, y0,y1,y2,y3)
	xor	48(CTX, CNT), y0
	xor	52(CTX, CNT), y1
	xor	56(CTX, CNT), y2
	xor	60(CTX, CNT), y3

	LTI(y0,y1,y2,y3)
	SBOX2I(y0,y1,y2,y3, x0,x1,x2,x3)
	xor	32(CTX, CNT), x0
	xor	36(CTX, CNT), x1
	xor	40(CTX, CNT), x2
	xor	44(CTX, CNT), x3

	LTI(x0,x1,x2,x3)
	SBOX1I(x0,x1,x2,x3, y0,y1,y2,y3)
	xor	16(CTX, CNT), y0
	xor	20(CTX, CNT), y1
	xor	24(CTX, CNT), y2
	xor	28(CTX, CNT), y3

	LTI(y0,y1,y2,y3)
	SBOX0I(y0,y1,y2,y3, x0,x1,x2,x3)
	xor	  (CTX, CNT), x0
	xor	 4(CTX, CNT), x1
	xor	 8(CTX, CNT), x2
	xor	12(CTX, CNT), x3

	sub	$128, CNT
	jnc	.Lround_loop

	movl	x0, (DST, N)
	movl	x1, 4(DST, N)
	movl	x2, 8(DST, N)
	movl	x3, 12(DST, N)

	C N is negative while data remains; the add carries out only when it
	C wraps past zero, which ends the loop.
	add	$16, N
	jnc	.Lblock_loop

.Lend:
	C Restore callee-saved registers in reverse push order.
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	W64_EXIT(4, 13)
	ret