1 C nettle, low-level cryptographics library
3 C Copyright (C) 2011 Niels Möller
5 C The nettle library is free software; you can redistribute it and/or modify
6 C it under the terms of the GNU Lesser General Public License as published by
7 C the Free Software Foundation; either version 2.1 of the License, or (at your
8 C option) any later version.
10 C The nettle library is distributed in the hope that it will be useful, but
11 C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
12 C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
13 C License for more details.
15 C You should have received a copy of the GNU Lesser General Public License
16 C along with the nettle library; see the file COPYING.LIB. If not, write to
17 C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
20 include_src(<x86_64/serpent.m4>)
24 C Single block serpent state, two copies
35 C Quadruple block serpent state, two copies
46 define(<MINUS1>, <%xmm8>) C set to all ones by pcmpeqd MINUS1, MINUS1 in the encrypt routine
48 define(<T1>, <%xmm10>) C NOTE(review): presumably scratch for the SSE2 code - confirm
49 define(<T2>, <%xmm11>) C NOTE(review): presumably scratch for the SSE2 code - confirm
50 define(<T3>, <%xmm12>) C NOTE(review): presumably scratch for the SSE2 code - confirm
59 define(<TMP32>, <%r14d>) C low 32 bits of r14; presumably scalar scratch - confirm
61 C SBOX macros. Inputs $1 - $4 (destroyed), outputs $5 - $8
64 mov $2, $8 C y3 = x1 ^ x2
66 mov $1, $5 C y0 = x0 | x3
68 mov $1, $6 C y1 = x0 ^ x1
71 mov $3, $7 C y2 = x2 | y3
77 mov $6, $5 C y0 = y1 & x2
85 mov $5, $6 C y1 = y0 ^ x1
91 mov $1, $6 C y1 = x0 | x3
93 mov $3, $7 C y2 = x2 ^ x3
97 mov $1, $8 C y3 = x0 ^ x2
100 and $4, $8 C y3 &= x3
101 mov $6, $1 C x0 = y1 & y2
104 xor $5, $7 C y2 ^= y0
105 xor $1, $8 C y3 ^= x0
106 mov $6, $1 C x0 = y1 ^ y3
108 xor $7, $1 C x0 ^= y2
109 mov $2, $6 C y1 = x1 & x3
111 xor $1, $6 C y1 ^= x0
112 mov $6, $4 C x3 = y1 | y3
115 and $4, $5 C y0 &= x3
116 xor $3, $5 C y0 ^= x2
120 mov $1, $7 C y2 = x1 | x2
300 C Parallel operation on four blocks at a time.
302 C pnot instruction is missing; emulate it by XOR with an all-ones constant
303 C (in memory per the original note; MINUS1 is a register, so confirm).
310 movdqa $2, $8 C y3 = x1 ^ x2
312 movdqa $1, $5 C y0 = x0 | x3
314 movdqa $1, $6 C y1 = x0 ^ x1
316 pxor $5, $8 C y3 ^= y0
317 movdqa $3, $7 C y2 = x2 | y3
319 pxor $4, $1 C x0 ^= x3
320 pand $4, $7 C y2 &= x3
321 pxor $3, $4 C x3 ^= x2
322 por $2, $3 C x2 |= x1
323 movdqa $6, $5 C y0 = y1 & x2
325 pxor $5, $7 C y2 ^= y0
326 pand $7, $5 C y0 &= y2
327 pxor $3, $5 C y0 ^= x2
328 pand $1, $2 C x1 &= x0
329 pxor $1, $5 C y0 ^= x0
331 movdqa $5, $6 C y1 = y0 ^ x1
333 pxor $4, $6 C y1 ^= x3
337 movdqa $1, $6 C y1 = x0 | x3
339 movdqa $3, $7 C y2 = x2 ^ x3
341 movdqa $2, $5 C y0 = ~x1
343 movdqa $1, $8 C y3 = x0 ^ x2
345 por $1, $5 C y0 |= x0
346 pand $4, $8 C y3 &= x3
347 movdqa $6, $1 C x0 = y1 & y2
349 por $2, $8 C y3 |= x1
350 pxor $5, $7 C y2 ^= y0
351 pxor $1, $8 C y3 ^= x0
352 movdqa $6, $1 C x0 = y1 ^ y3
354 pxor $7, $1 C x0 ^= y2
355 movdqa $2, $6 C y1 = x1 & x3
357 pxor $1, $6 C y1 ^= x0
358 movdqa $6, $4 C x3 = y1 | y3
361 pand $4, $5 C y0 &= x3
362 pxor $3, $5 C y0 ^= x2
366 movdqa $1, $7 C y2 = x1 | x2
502 pandn $1, $6 C t02 implicit
524 C WLT(x0, x1, x2, x3)
546 .file "serpent-encrypt.asm"
548 C serpent_encrypt(struct serpent_context *ctx,
549 C unsigned length, uint8_t *dst,
550 C const uint8_t *src)
553 PROLOGUE(nettle_serpent_encrypt)
554 C save all registers that need to be saved
567 C Point at the final subkey.
573 pcmpeqd MINUS1, MINUS1
577 movups 16(SRC, N), X1
578 movups 32(SRC, N), X2
579 movups 48(SRC, N), X3
581 WTRANSPOSE(X0, X1, X2, X3)
590 WKEYXOR(, X0,X1,X2,X3)
591 WSBOX0(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
594 WKEYXOR(16, Y0,Y1,Y2,Y3)
595 WSBOX1(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
598 WKEYXOR(32, X0,X1,X2,X3)
599 WSBOX2(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
602 WKEYXOR(48, Y0,Y1,Y2,Y3)
603 WSBOX3(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
606 WKEYXOR(64, X0,X1,X2,X3)
607 WSBOX4(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
610 WKEYXOR(80, Y0,Y1,Y2,Y3)
611 WSBOX5(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
614 WKEYXOR(96, X0,X1,X2,X3)
615 WSBOX6(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
618 WKEYXOR(112, Y0,Y1,Y2,Y3)
619 WSBOX7(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
623 C FIXME: CNT known to be zero, no index register needed
624 WKEYXOR(, X0,X1,X2,X3)
626 WTRANSPOSE(X0,X1,X2,X3)
629 movups X1, 16(DST, N)
630 movups X2, 32(DST, N)
631 movups X3, 48(DST, N)
633 C FIXME: Adjust N, so we can use just jnc without an extra cmp.
640 C The single-block loop here is slightly slower than the double-block
641 C loop in serpent-encrypt.c.
643 C FIXME: Should use non-sse2 code only if we have a single block left.
644 C With two or three blocks, it should be better to do them in parallel.
664 SBOX0(x0,x1,x2,x3, y0,y1,y2,y3)
671 SBOX1(y0,y1,y2,y3, x0,x1,x2,x3)
678 SBOX2(x0,x1,x2,x3, y0,y1,y2,y3)
685 SBOX3(y0,y1,y2,y3, x0,x1,x2,x3)
692 SBOX4(x0,x1,x2,x3, y0,y1,y2,y3)
699 SBOX5(y0,y1,y2,y3, x0,x1,x2,x3)
703 xor 100(CTX, CNT), x1
704 xor 104(CTX, CNT), x2
705 xor 108(CTX, CNT), x3
706 SBOX6(x0,x1,x2,x3, y0,y1,y2,y3)
709 xor 112(CTX, CNT), y0
710 xor 116(CTX, CNT), y1
711 xor 120(CTX, CNT), y2
712 xor 124(CTX, CNT), y3
713 SBOX7(y0,y1,y2,y3, x0,x1,x2,x3)
717 C Apply final subkey.