1 C x86_64/serpent-encrypt.asm
4 Copyright (C) 2011 Niels Möller
6 This file is part of GNU Nettle.
8 GNU Nettle is free software: you can redistribute it and/or
9 modify it under the terms of either:
11 * the GNU Lesser General Public License as published by the Free
12 Software Foundation; either version 3 of the License, or (at your
13 option) any later version.
17 * the GNU General Public License as published by the Free
18 Software Foundation; either version 2 of the License, or (at your
19 option) any later version.
21 or both in parallel, as here.
23 GNU Nettle is distributed in the hope that it will be useful,
24 but WITHOUT ANY WARRANTY; without even the implied warranty of
25 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
26 General Public License for more details.
28 You should have received copies of the GNU General Public License and
29 the GNU Lesser General Public License along with this program. If
30 not, see http://www.gnu.org/licenses/.
33 include_src(<x86_64/serpent.m4>)
37 C Single block serpent state, two copies
48 C Quadruple block serpent state, two copies
59 define(<MINUS1>, <%xmm8>)
61 define(<T1>, <%xmm10>)
62 define(<T2>, <%xmm11>)
63 define(<T3>, <%xmm12>)
72 define(<TMP32>, <%r14d>)
74 C SBOX macros. Inputs $1 - $4 (destroyed), outputs $5 - $8
77 mov $2, $8 C y3 = x1 ^ x2
79 mov $1, $5 C y0 = x0 | x3
81 mov $1, $6 C y1 = x0 ^ x1
84 mov $3, $7 C y2 = x2 | y3
90 mov $6, $5 C y0 = y1 & x2
98 mov $5, $6 C y1 = y0 ^ x1
100 xor $4, $6 C y1 ^= x3
104 mov $1, $6 C y1 = x0 | x3
106 mov $3, $7 C y2 = x2 ^ x3
108 mov $2, $5 C y0 = ~x1
110 mov $1, $8 C y3 = x0 ^ x2
113 and $4, $8 C y3 &= x3
114 mov $6, $1 C x0 = y1 & y2
117 xor $5, $7 C y2 ^= y0
118 xor $1, $8 C y3 ^= x0
119 mov $6, $1 C x0 = y1 ^ y3
121 xor $7, $1 C x0 ^= y2
122 mov $2, $6 C y1 = x1 & x3
124 xor $1, $6 C y1 ^= x0
125 mov $6, $4 C x3 = y1 | y3
128 and $4, $5 C y0 &= x3
129 xor $3, $5 C y0 ^= x2
133 mov $1, $7 C y2 = x0 | x2
313 C Parallel operation on four blocks at a time.
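315 C pnot instruction is missing, so negate by XOR with an all-ones constant.
316 C NOTE(review): this said "for lack of a spare register, XOR with constant in memory", but MINUS1 is defined as %xmm8 and filled by pcmpeqd. Confirm which one PNOT uses.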
323 movdqa $2, $8 C y3 = x1 ^ x2
325 movdqa $1, $5 C y0 = x0 | x3
327 movdqa $1, $6 C y1 = x0 ^ x1
329 pxor $5, $8 C y3 ^= y0
330 movdqa $3, $7 C y2 = x2 | y3
332 pxor $4, $1 C x0 ^= x3
333 pand $4, $7 C y2 &= x3
334 pxor $3, $4 C x3 ^= x2
335 por $2, $3 C x2 |= x1
336 movdqa $6, $5 C y0 = y1 & x2
338 pxor $5, $7 C y2 ^= y0
339 pand $7, $5 C y0 &= y2
340 pxor $3, $5 C y0 ^= x2
341 pand $1, $2 C x1 &= x0
342 pxor $1, $5 C y0 ^= x0
344 movdqa $5, $6 C y1 = y0 ^ x1
346 pxor $4, $6 C y1 ^= x3
350 movdqa $1, $6 C y1 = x0 | x3
352 movdqa $3, $7 C y2 = x2 ^ x3
354 movdqa $2, $5 C y0 = ~x1
356 movdqa $1, $8 C y3 = x0 ^ x2
358 por $1, $5 C y0 |= x0
359 pand $4, $8 C y3 &= x3
360 movdqa $6, $1 C x0 = y1 & y2
362 por $2, $8 C y3 |= x1
363 pxor $5, $7 C y2 ^= y0
364 pxor $1, $8 C y3 ^= x0
365 movdqa $6, $1 C x0 = y1 ^ y3
367 pxor $7, $1 C x0 ^= y2
368 movdqa $2, $6 C y1 = x1 & x3
370 pxor $1, $6 C y1 ^= x0
371 movdqa $6, $4 C x3 = y1 | y3
374 pand $4, $5 C y0 &= x3
375 pxor $3, $5 C y0 ^= x2
379 movdqa $1, $7 C y2 = x0 | x2
515 pandn $1, $6 C t02 implicit
537 C WLT(x0, x1, x2, x3)
559 .file "serpent-encrypt.asm"
561 C serpent_encrypt(struct serpent_context *ctx,
562 C size_t length, uint8_t *dst,
563 C const uint8_t *src)
566 PROLOGUE(nettle_serpent_encrypt)
567 C save all registers that need to be saved
580 C Point at the final subkey.
586 pcmpeqd MINUS1, MINUS1 C MINUS1 = all ones: every dword compares equal to itself
590 movups 16(SRC, N), X1
591 movups 32(SRC, N), X2
592 movups 48(SRC, N), X3
594 WTRANSPOSE(X0, X1, X2, X3)
603 WKEYXOR(, X0,X1,X2,X3)
604 WSBOX0(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
607 WKEYXOR(16, Y0,Y1,Y2,Y3)
608 WSBOX1(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
611 WKEYXOR(32, X0,X1,X2,X3)
612 WSBOX2(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
615 WKEYXOR(48, Y0,Y1,Y2,Y3)
616 WSBOX3(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
619 WKEYXOR(64, X0,X1,X2,X3)
620 WSBOX4(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
623 WKEYXOR(80, Y0,Y1,Y2,Y3)
624 WSBOX5(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
627 WKEYXOR(96, X0,X1,X2,X3)
628 WSBOX6(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
631 WKEYXOR(112, Y0,Y1,Y2,Y3)
632 WSBOX7(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
636 C FIXME: CNT known to be zero, no index register needed
637 WKEYXOR(, X0,X1,X2,X3)
639 WTRANSPOSE(X0,X1,X2,X3)
642 movups X1, 16(DST, N)
643 movups X2, 32(DST, N)
644 movups X3, 48(DST, N)
646 C FIXME: Adjust N, so we can use just jnc without an extra cmp.
653 C The single-block loop here is slightly slower than the double-block
654 C loop in serpent-encrypt.c.
656 C FIXME: Should use non-sse2 code only if we have a single block left.
657 C With two or three blocks, it should be better to do them in parallel.
677 SBOX0(x0,x1,x2,x3, y0,y1,y2,y3)
684 SBOX1(y0,y1,y2,y3, x0,x1,x2,x3)
691 SBOX2(x0,x1,x2,x3, y0,y1,y2,y3)
698 SBOX3(y0,y1,y2,y3, x0,x1,x2,x3)
705 SBOX4(x0,x1,x2,x3, y0,y1,y2,y3)
712 SBOX5(y0,y1,y2,y3, x0,x1,x2,x3)
716 xor 100(CTX, CNT), x1
717 xor 104(CTX, CNT), x2
718 xor 108(CTX, CNT), x3
719 SBOX6(x0,x1,x2,x3, y0,y1,y2,y3)
722 xor 112(CTX, CNT), y0
723 xor 116(CTX, CNT), y1
724 xor 120(CTX, CNT), y2
725 xor 124(CTX, CNT), y3
726 SBOX7(y0,y1,y2,y3, x0,x1,x2,x3)
730 C Apply final subkey.