1 ; ***** BEGIN LICENSE BLOCK *****
2 ; Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 ; The contents of this file are subject to the Mozilla Public License Version
5 ; 1.1 (the "License"); you may not use this file except in compliance with
6 ; the License. You may obtain a copy of the License at
7 ; http://www.mozilla.org/MPL/
9 ; Software distributed under the License is distributed on an "AS IS" basis,
10 ; WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 ; for the specific language governing rights and limitations under the License.
14 ; The Original Code is "Marc Bevand's fast AMD64 ARCFOUR source"
16 ; The Initial Developer of the Original Code is
17 ; Marc Bevand <bevand_m@epita.fr> .
18 ; Portions created by the Initial Developer are
19 ; Copyright (C) 2004 the Initial Developer. All Rights Reserved.
21 ; Contributor(s): Makoto Kato (m_kato@ga2.so-net.ne.jp)
23 ; Alternatively, the contents of this file may be used under the terms of
24 ; either the GNU General Public License Version 2 or later (the "GPL"), or
25 ; the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 ; in which case the provisions of the GPL or the LGPL are applicable instead
27 ; of those above. If you wish to allow use of your version of this file only
28 ; under the terms of either the GPL or the LGPL, and not to allow others to
29 ; use your version of this file under the terms of the MPL, indicate your
30 ; decision by deleting the provisions above and replace them with the notice
31 ; and other provisions required by the GPL or the LGPL. If you do not delete
32 ; the provisions above, a recipient may use your version of this file under
33 ; the terms of any one of the MPL, the GPL or the LGPL.
35 ; ***** END LICENSE BLOCK *****
37 ; ** ARCFOUR implementation optimized for AMD64.
39 ; ** The throughput achieved by this code is about 320 MBytes/sec, on
40 ; ** a 1.8 GHz AMD Opteron (rev C0) processor.
44 ; extern void ARCFOUR(RC4Context *cx, unsigned long long inputLen,
45 ; const unsigned char *input, unsigned char *output);
; Body of ARCFOUR (C signature in the comment above).
; The four arguments are read from rcx, rdx, r8, r9 in that order, which
; matches the Microsoft x64 calling convention.
;
; Register roles as used below:
;   rbp = d, base of the state table (cx + 16); cx->x is [rbp-16], cx->y is [rbp-8]
;   rcx = x, rdx = y (indices into d), rax = tx = d[x]
;   rsi = in, rdi = out
;   rbx = in+len-8 at first, then reused as scratch (ty / val)
;   r9  = copy of in+len-8, later in+len
;   r8  = keystream bytes, r11 = byte counter
; Table entries are addressed as index*8, i.e. 8 bytes apart. The rounds touch
; only the low 32 bits (or low byte) of an entry; the first tx load reads 64.
;
; NOTE(review): this listing looks incomplete. Lend and Lfinished are
; referenced but not defined here, and nothing visible updates y (rdx),
; decrements r11, branches back for another round, stores to out, or
; saves/restores rbp/rbx/rsi/rdi (callee-saved under the Microsoft x64
; convention). Confirm all of these against the full file.
55 mov rbp, rcx ; rbp = cx (1st argument)
56 mov rbx, rdx ; rbx = inputLen (2nd argument)
57 mov rsi, r8 ; rsi = input pointer (3rd argument)
58 mov rdi, r9 ; rdi = output pointer (4th argument)
59 mov rcx, [rbp] ; x = cx->x (64-bit load from offset 0)
60 mov rdx, [rbp+8] ; y = cx->y (64-bit load from offset 8)
61 add rbp, 16 ; rbp = d, the state table that starts at offset 16
63 and rcx, 0ffh ; x &= 0xff, so rcx is a table index with all upper bits clear
64 lea rbx, [rbx+rsi-8] ; rbx = in+len-8 (lea: no flags touched)
65 mov r9, rbx ; r9 = in+len-8; kept in r9 because rbx is overwritten as scratch below
66 mov rax, [rbp+rcx*8] ; tx = d[x] (full 64-bit load of the entry)
67 cmp rbx, rsi ; compare in+len-8 against in
68 jl Lend ; go to Lend if in+len-8 < in, i.e. len < 8. NOTE(review): jl is a signed compare on addresses; jb would be the unsigned form
71 add rsi, 8 ; in += 8, advanced before the keystream bytes are generated
72 add rdi, 8 ; out += 8, advanced likewise
75 ; generate the next 8 bytes of the RC4 keystream into r8
78 mov r11, 8 ; r11 = 8, number of keystream bytes to produce
82 mov ebx, [rbp+rdx*8] ; ty = d[y] (32-bit load, which also zeroes the upper half of rbx)
83 mov [rbp+rcx*8], ebx ; d[x] = ty
84 add bl, al ; val = (ty + tx) mod 256 via 8-bit add; presumably entries are < 256 so rbx is a valid index -- confirm
85 mov [rbp+rdx*8], eax ; d[y] = tx (completes the swap of d[x] and d[y])
86 inc cl ; x = (x + 1) mod 256, prepared for the NEXT round
87 mov eax, [rbp+rcx*8] ; tx = d[x], loaded early for the NEXT round
88 mov r8b, [rbp+rbx*8] ; low byte of r8 = d[val], one keystream byte
90 ror r8, 8 ; rotate r8 right by one byte to make room for the next one; ror leaves ZF unchanged
98 cmp rsi, r9 ; compare the already-advanced in against in+len-8
103 add r9, 8 ; r9 = in+len, one past the last input byte
106 ; handle the remaining bytes one at a time
110 cmp r9, rsi ; compare in+len against the current in
111 jle Lfinished ; go to Lfinished once in+len <= in (signed compare, as above)
113 mov ebx, [rbp+rdx*8] ; ty = d[y] (32-bit load, zeroes the upper half of rbx)
114 mov [rbp+rcx*8], ebx ; d[x] = ty
115 add bl, al ; val = (ty + tx) mod 256
116 mov [rbp+rdx*8], eax ; d[y] = tx (completes the swap)
117 inc cl ; x = (x + 1) mod 256, prepared for the NEXT round
118 mov eax, [rbp+rcx*8] ; tx = d[x], loaded early for the NEXT round
119 mov r8b, [rbp+rbx*8] ; r8b = d[val], one keystream byte
120 xor r8b, [rsi] ; r8b = keystream byte XOR the input byte at in
128 mov [rbp-8], dl ; cx->y = y; only the low byte is stored (rbp-8 is cx+8)
129 mov [rbp-16], cl ; cx->x = x; only the low byte is stored (rbp-16 is cx+0)