C nettle, low-level cryptographics library
C
C Copyright (C) 2013 Niels Möller
C
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
C License for more details.
C
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB. If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
C MA 02111-1301, USA.

C Argument registers, in the order of the C prototype
C (out, n, key, length, msg).
define(<OUT>, <%rdi>)
define(<ITERS>, <%rsi>)
define(<KEY>, <%rdx>)
define(<LENGTH>, <%rcx>)
define(<MSG>, <%r8>)

C Message words, replicated per 64-bit lane.
define(<XM0>, <%xmm0>)
define(<XM1>, <%xmm1>)
define(<XM2>, <%xmm2>)
define(<XM3>, <%xmm3>)

C Key words.
define(<XK0>, <%xmm4>)
define(<XK1>, <%xmm5>)
define(<XK2>, <%xmm6>)
define(<XK3>, <%xmm7>)

C Temporaries (XT2 and XT3 also hold permuted keys in the 4-way path).
define(<XT0>, <%xmm8>)
define(<XT1>, <%xmm9>)
define(<XT2>, <%xmm10>)
define(<XT3>, <%xmm11>)

C 64-bit accumulators, two lanes each.
define(<XY0>, <%xmm12>)
define(<XY1>, <%xmm13>)

C Copy [0,1,2,3] to [1,1,3,3]
C Expands to the mnemonic and immediate only; the two register operands
C follow at the use site, as in "HI2LO XK0, XT0". The dollar sign is
C quoted on its own so that m4 does not substitute the macro name for $0.
define(<HI2LO>, <pshufd	<$>0xf5,>)

C FIXME: Would be nice if we could force the key array to be 16-byte
C aligned.
.file "umac-nh-n.asm"

	C umac_nh_n(uint64_t *out, unsigned n, const uint32_t *key,
	C	    unsigned length, const uint8_t *msg)
	C
	C Syntax: AT&T operand order (source first, destination last), run
	C through m4.
	C
	C Computes n parallel sums over msg, each sum using the key shifted
	C by a further 16 bytes (4 words). Per 32-byte message block, every
	C sum adds the 64-bit products
	C   (msg[i] + key[i]) * (msg[i+4] + key[i+4]),  i = 0..3,
	C where the additions wrap at 32 bits (paddd) and the products are
	C full 32x32 to 64 bit (pmuludq).
	C
	C Dispatch on n:
	C   n below 3  -> .Lnh2, stores 16 bytes (two sums) to out
	C   n equal 3  -> .Lnh3, stores 24 bytes (three sums) to out
	C   otherwise  -> .Lnh4, stores 32 bytes (four sums) to out
	C
	C Each path is a do-while loop consuming 32 bytes of msg per pass, so
	C at least one block is always processed; the loop repeats while the
	C remaining length, before the subtraction, was above 32.
	C All loads and stores use unaligned moves; no alignment of key, msg
	C or out is required.
	C
	C NOTE(review): W64_ENTRY/W64_EXIT, PROLOGUE/EPILOGUE, ALIGN and XREG
	C are defined outside this file; W64_ENTRY(5, 14) presumably adapts
	C the Windows x64 convention for 5 arguments and 14 xmm registers.
	.text
	ALIGN(16)
PROLOGUE(_nettle_umac_nh_n)
	W64_ENTRY(5, 14)
	pxor	XY0, XY0

	C n is a 32-bit unsigned argument, so only the low half of its
	C register is defined by the caller; compare that half only.
	cmpl	$3, XREG(ITERS)
	jc	.Lnh2
	je	.Lnh3

.Lnh4:
	C Four sums: XY0 is stored to out[0..1], XY1 to out[2..3].
	movups	(KEY), XK0
	movups	16(KEY), XT2
	movups	32(KEY), XK2
	lea	48(KEY), KEY
	C Leave XK2 untouched, and put permuted keys in XK0, XK1, XT2, XT3
	movaps	XK0, XK1

	punpcklqdq XT2, XK0	C [0,1,4,5]
	punpckhqdq XT2, XK1	C [2,3,6,7]
	movaps	XT2, XT3
	punpcklqdq XK2, XT2	C [4,5, 8, 9]
	punpckhqdq XK2, XT3	C [6,7,10,11]

	movaps	XY0, XY1

.Loop4:
	movups	(MSG), XT0
	movups	16(MSG), XT1

	pshufd	$0xee, XT1, XM3	C [6,7,6,7]
	pshufd	$0x44, XT1, XM2	C [4,5,4,5]
	pshufd	$0xee, XT0, XM1	C [2,3,2,3]
	pshufd	$0x44, XT0, XM0	C [0,1,0,1]

	C First pair of sums. The message is added into the key registers,
	C which are rebuilt below, so XM0-XM3 stay intact for the second pair.
	paddd	XM0, XK0
	paddd	XM1, XK1
	paddd	XM2, XT2
	paddd	XM3, XT3

	HI2LO	XK0, XT0
	HI2LO	XT2, XT1
	pmuludq	XK0, XT2
	pmuludq	XT0, XT1
	paddq	XT2, XY0
	paddq	XT1, XY0

	HI2LO	XK1, XT0
	HI2LO	XT3, XT1
	pmuludq	XK1, XT3
	pmuludq	XT0, XT1
	paddq	XT3, XY0
	paddq	XT1, XY0

	C Permuted keys for the second pair. They are left unmodified and
	C serve as the first-pair keys of the next pass.
	movaps	XK2, XK0
	movaps	XK2, XK1
	movups	(KEY), XT2
	movups	16(KEY), XK2
	punpcklqdq XT2, XK0	C [ 8, 9,12,13]
	punpckhqdq XT2, XK1	C [10,11,14,15]
	movaps	XT2, XT3

	punpcklqdq XK2, XT2	C [12,13,16,17]
	punpckhqdq XK2, XT3	C [14,15,18,19]

	C Second pair of sums; here the keys are added into the message.
	paddd	XK0, XM0
	paddd	XK1, XM1
	paddd	XT2, XM2
	paddd	XT3, XM3

	HI2LO	XM0, XT0
	HI2LO	XM2, XT1
	pmuludq	XM0, XM2
	pmuludq	XT0, XT1
	paddq	XM2, XY1
	paddq	XT1, XY1

	HI2LO	XM1, XT0
	HI2LO	XM3, XT1
	pmuludq	XM1, XM3
	pmuludq	XT0, XT1
	paddq	XM3, XY1
	paddq	XT1, XY1

	C lea leaves the flags from subl intact for the ja below.
	subl	$32, XREG(LENGTH)
	lea	32(MSG), MSG
	lea	32(KEY), KEY
	ja	.Loop4

	movups	XY0, (OUT)
	movups	XY1, 16(OUT)

	W64_EXIT(5, 14)
	ret

.Lnh3:
	C Three sums: XY0 holds sums 0 and 1 in its two lanes; XY1 holds two
	C partial sums for sum 2, folded together after the loop.
	movups	(KEY), XK0
	movups	16(KEY), XK1
	movaps	XY0, XY1
.Loop3:
	lea	32(KEY), KEY
	movups	(MSG), XT0
	movups	16(MSG), XT1
	movups	(KEY), XK2
	movups	16(KEY), XK3
	pshufd	$0xee, XT1, XM3	C [6,7,6,7]
	pshufd	$0x44, XT1, XM2	C [4,5,4,5]
	pshufd	$0xee, XT0, XM1	C [2,3,2,3]
	pshufd	$0x44, XT0, XM0	C [0,1,0,1]

	C Iteration 2
	paddd	XK2, XT0
	paddd	XK3, XT1
	HI2LO	XT0, XT2
	HI2LO	XT1, XT3
	pmuludq	XT0, XT1
	pmuludq	XT2, XT3
	paddq	XT1, XY1
	paddq	XT3, XY1

	C Iteration 0,1
	movaps	XK0, XT0
	punpcklqdq XK1, XK0	C [0,1,4,5]
	punpckhqdq XK1, XT0	C [2,3,6,7]
	paddd	XK0, XM0
	paddd	XT0, XM1
	movaps	XK1, XT0
	punpcklqdq XK2, XK1	C [4,5,8,9]
	punpckhqdq XK2, XT0	C [6,7,10,11]
	paddd	XK1, XM2
	paddd	XT0, XM3

	HI2LO	XM0, XT0
	HI2LO	XM2, XT1
	pmuludq	XM0, XM2
	pmuludq	XT0, XT1
	paddq	XM2, XY0
	paddq	XT1, XY0

	HI2LO	XM1, XT0
	HI2LO	XM3, XT1
	pmuludq	XM1, XM3
	pmuludq	XT0, XT1
	paddq	XM3, XY0
	paddq	XT1, XY0

	C lea and movaps leave the flags from subl intact for the ja below.
	subl	$32, XREG(LENGTH)
	lea	32(MSG), MSG
	C The keys just loaded become the leading keys of the next pass.
	movaps	XK2, XK0
	movaps	XK3, XK1

	ja	.Loop3

	C Add the high lane of XY1 into its low lane; only the low 64 bits
	C are stored.
	pshufd	$0xe, XY1, XT0
	paddq	XT0, XY1
	movups	XY0, (OUT)
	movlpd	XY1, 16(OUT)

	W64_EXIT(5, 14)
	ret

.Lnh2:
	C Explode message as [0,1,0,1] [2,3,2,3] [4,5,4,5] [6,7, 6, 7]
	C Interleave keys as [0,1,4,5] [2,3,6,7] [4,5,8,9] [6,7,10,11]
	movups	(KEY), XK0
	lea	16(KEY), KEY
.Loop2:
	movups	(MSG), XM0
	movups	16(MSG), XM1
	pshufd	$0xee, XM1, XM3	C [6,7,6,7]
	pshufd	$0x44, XM1, XM2	C [4,5,4,5]
	pshufd	$0xee, XM0, XM1	C [2,3,2,3]
	pshufd	$0x44, XM0, XM0	C [0,1,0,1]

	movups	(KEY), XK1
	movups	16(KEY), XK2
	movaps	XK0, XT0
	punpcklqdq XK1, XK0	C [0,1,4,5]
	punpckhqdq XK1, XT0	C [2,3,6,7]
	paddd	XK0, XM0
	paddd	XT0, XM1
	C The last key block loaded becomes the leading key of the next pass.
	movaps	XK2, XK0
	movaps	XK1, XT0
	punpcklqdq XK2, XK1	C [4,5,8,9]
	punpckhqdq XK2, XT0	C [6,7,10,11]
	paddd	XK1, XM2
	paddd	XT0, XM3

	HI2LO	XM0, XT0
	HI2LO	XM2, XT1
	pmuludq	XM0, XM2
	pmuludq	XT0, XT1
	paddq	XM2, XY0
	paddq	XT1, XY0

	HI2LO	XM1, XT0
	HI2LO	XM3, XT1
	pmuludq	XM1, XM3
	pmuludq	XT0, XT1
	paddq	XM3, XY0
	paddq	XT1, XY0

	C lea leaves the flags from subl intact for the ja below.
	subl	$32, XREG(LENGTH)
	lea	32(MSG), MSG
	lea	32(KEY), KEY

	ja	.Loop2

	movups	XY0, (OUT)

	W64_EXIT(5, 14)
	ret
EPILOGUE(_nettle_umac_nh_n)