C -*- mode: asm; asm-comment-char: ?C; -*-
C nettle, low-level cryptographics library
C
C Copyright (C) 2002, 2005 Niels Möller
C
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
C License for more details.
C
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB.  If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
C MA 02111-1301, USA.

C	Define to YES, to enable the complex code to special case SRC
C	and DST with compatible alignment.
C	NOTE(review): this define carries neither a name nor a value, and
C	nothing visible in this file tests such a switch -- confirm whether
C	it is still wanted before filling it in or removing it.
define(, )

C	Registers

C	The four arguments of arcfour_crypt, in prototype order
C	(ctx, length, dst, src).
define(<CTX>,	<%i0>)
define(<LENGTH>,<%i1>)
define(<DST>,	<%i2>)
define(<SRC>,	<%i3>)

C	Working values.  I1 and I2 alternate as "current index" and
C	"next index"; J is the second state index; SI and SJ are used
C	inside ARCFOUR_BYTE; TMP, TMP2 and DATA assemble the output;
C	N counts 4-byte iterations of the main loop.
C	NOTE(review): the pairing of these nine names with these nine
C	registers follows listing order -- confirm the intended allocation.
define(<I1>,	<%i4>)
define(<I2>,	<%i5>)
define(<J>,	<%g1>)
define(<SI>,	<%g2>)
define(<SJ>,	<%g3>)
define(<TMP>,	<%o0>)
define(<TMP2>,	<%o1>)
define(<N>,	<%o2>)
define(<DATA>,	<%o3>)

C	Computes the next byte of the key stream. As input, i must
C	already point to the index for the current access, the index
C	for the next access is stored in ni. The resulting key byte is
C	stored in res.
C	ARCFOUR_BYTE(i, ni, res)
C	  i	register with the index of the current access (only read)
C	  ni	register that receives (i + 1) & 0xff
C	  res	register that receives the key stream byte
C	i and ni must be different registers: i is used once more, for
C	the final store, after ni has been written.
C	Updates J and overwrites SI and SJ.  The expansion consists only
C	of plain add/and/ldub/stb instructions, so the integer condition
C	codes pass through it unchanged; the code further down relies
C	on that.
define(<ARCFOUR_BYTE>, <
	ldub	[CTX + $1], SI
	add	$1, 1, $2
	add	J, SI, J
	and	J, 0xff, J
	ldub	[CTX + J], SJ
	and	$2, 0xff, $2
	stb	SI, [CTX + J]
	add	SI, SJ, SI
	and	SI, 0xff, SI
	stb	SJ, [CTX + $1]
	ldub	[CTX + SI], $3
>)dnl

C	FIXME: Consider using the callers window
define(<FRAME_SIZE>, 104)

	.file "arcfour-crypt.asm"

	C arcfour_crypt(struct arcfour_ctx *ctx,
	C               unsigned length, uint8_t *dst,
	C               const uint8_t *src)
	C
	C XORs LENGTH bytes from SRC with the key stream and writes the
	C result to DST.  Bytes are always read one at a time; the writes
	C are widened to 16 and then 32 bits once DST is suitably aligned.
	C I1 and I2 swap roles after every byte: each ARCFOUR_BYTE call
	C consumes one of them as the current index and leaves the next
	C index in the other.

	.section	".text"
	.align 16
	.proc	020

PROLOGUE(nettle_arcfour_crypt)

	save	%sp, -FRAME_SIZE, %sp
	C Nothing to do for an empty request; the stored state is left
	C untouched.
	cmp	LENGTH, 0
	be	.Lend
	nop

	C Load both I and J
	C One 16-bit load: the low byte becomes J, the high byte I1.
	lduh	[CTX + ARCFOUR_I], I1
	and	I1, 0xff, J
	srl	I1, 8, I1

	C We want an even address for DST
	andcc	DST, 1, %g0
	add	I1, 1 ,I1
	beq	.Laligned2
	C Delay slot: together with the add above this advances I1 to
	C the index of the first byte, on both paths.
	and	I1, 0xff, I1

	C DST is odd: handle a single byte.
	mov	I1, I2
	ldub	[SRC], DATA
	ARCFOUR_BYTE(I2, I1, TMP)
	subcc	LENGTH, 1, LENGTH
	add	SRC, 1, SRC
	xor	DATA, TMP, DATA
	stb	DATA, [DST]
	C Tests the result of the subcc above; nothing in between
	C touches the condition codes.
	beq	.Ldone
	add	DST, 1, DST

.Laligned2:

	cmp	LENGTH, 2
	blu	.Lfinal1
	C Harmless delay slot instruction
	andcc	DST, 2, %g0
	beq	.Laligned4
	nop

	C DST is 2-aligned but not 4-aligned: handle two bytes with one
	C 16-bit store, first byte in the high half.
	ldub	[SRC], DATA
	ARCFOUR_BYTE(I1, I2, TMP)
	ldub	[SRC + 1], TMP2
	add	SRC, 2, SRC
	xor	DATA, TMP, DATA
	sll	DATA, 8, DATA

	ARCFOUR_BYTE(I2, I1, TMP)
	xor	TMP2, TMP, TMP
	subcc	LENGTH, 2, LENGTH
	or	DATA, TMP, DATA

	sth	DATA, [DST]
	beq	.Ldone
	add	DST, 2, DST

.Laligned4:
	cmp	LENGTH, 4
	blu	.Lfinal2
	C Harmless delay slot instruction
	C N = number of complete 4-byte groups.
	srl	LENGTH, 2, N

.Loop:
	C Main loop, with aligned writes

	C FIXME: Could check if SRC is aligned, and
	C use 32-bit reads in that case.

	C Four bytes are read singly, XORed with the key stream and
	C shifted into DATA so that the first byte ends up in the most
	C significant position.
	ldub	[SRC], DATA
	ARCFOUR_BYTE(I1, I2, TMP)
	ldub	[SRC + 1], TMP2
	xor	TMP, DATA, DATA
	sll	DATA, 8, DATA

	ARCFOUR_BYTE(I2, I1, TMP)
	xor	TMP2, TMP, TMP
	ldub	[SRC + 2], TMP2
	or	TMP, DATA, DATA
	sll	DATA, 8, DATA

	ARCFOUR_BYTE(I1, I2, TMP)
	xor	TMP2, TMP, TMP
	ldub	[SRC + 3], TMP2
	or	TMP, DATA, DATA
	sll	DATA, 8, DATA

	ARCFOUR_BYTE(I2, I1, TMP)
	xor	TMP2, TMP, TMP
	or	TMP, DATA, DATA
	subcc	N, 1, N
	add	SRC, 4, SRC
	st	DATA, [DST]
	bne	.Loop
	add	DST, 4, DST

	C LENGTH was not counted down inside the loop; keep only the
	C 0..3 bytes that remain.
	andcc	LENGTH, 3, LENGTH
	beq	.Ldone
	nop

.Lfinal2:
	C DST address must be 2-aligned
	cmp	LENGTH, 2
	blu	.Lfinal1
	nop

	ldub	[SRC], DATA
	ARCFOUR_BYTE(I1, I2, TMP)
	ldub	[SRC + 1], TMP2
	add	SRC, 2, SRC
	xor	DATA, TMP, DATA
	sll	DATA, 8, DATA

	ARCFOUR_BYTE(I2, I1, TMP)
	xor	TMP2, TMP, TMP
	or	DATA, TMP, DATA

	sth	DATA, [DST]
	C Still tests the "cmp LENGTH, 2" at the top of this section:
	C equal means these were the last two bytes, otherwise one more
	C byte follows.
	beq	.Ldone
	add	DST, 2, DST

.Lfinal1:
	C Last single byte.  The copy gives ARCFOUR_BYTE two distinct
	C registers and leaves the index that was used in I2.
	mov	I1, I2
	ldub	[SRC], DATA
	ARCFOUR_BYTE(I2, I1, TMP)
	xor	DATA, TMP, DATA
	stb	DATA, [DST]

.Ldone:
	C Save back I and J
	C On every path that reaches this label I2 is the index used for
	C the most recent byte.  It is written back as the high byte,
	C with J as the low byte, mirroring the load at the top.
	sll	I2, 8, I2
	or	I2, J, I2
	sth	I2, [CTX + ARCFOUR_I]

.Lend:
	ret
	restore

EPILOGUE(nettle_arcfour_crypt)

C	Some stats from adriana.lysator.liu.se (SS1000E, 85 MHz), for AES 128
C	NOTE(review): this file holds arcfour code, so "for AES 128" in the
C	line above looks like a leftover from another file -- confirm.

C 1:	nettle-1.13 C-code
C 2:	First working version of the assembler code
C 3:	Moved load of source byte
C 4:	Better instruction scheduling
C 5:	Special case SRC and DST with compatible alignment
C 6:	After bugfix (reorder of ld [CTX+SI+SJ] and st [CTX + SI])
C 7:	Unrolled only twice, with byte-accesses
C 8:	Unrolled, using 8-bit reads and aligned 32-bit writes.

C	MB/s	cycles/byte	Code size (bytes)
C 1:	6.6	12.4		132
C 2:	5.6	14.5		116
C 3:	6.0	13.5		116
C 4:	6.5	12.4		116
C 5:	7.9	10.4		496
C 6:	8.3	9.7		496
C 7:	6.7	12.1		268
C 8:	8.3	9.8		768