mozilla/security/nss/lib/freebl/arcfour-amd64-masm.asm

   1 ; ***** BEGIN LICENSE BLOCK *****
   2 ; Version: MPL 1.1/GPL 2.0/LGPL 2.1
   3 ;
   4 ; The contents of this file are subject to the Mozilla Public License Version
   5 ; 1.1 (the "License"); you may not use this file except in compliance with
   6 ; the License. You may obtain a copy of the License at
   7 ; http://www.mozilla.org/MPL/
   8 ;
   9 ; Software distributed under the License is distributed on an "AS IS" basis,
  10 ; WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  11 ; for the specific language governing rights and limitations under the
  12 ; License.
  13 ;
  14 ; The Original Code is "Marc Bevand's fast AMD64 ARCFOUR source"
  15 ;
  16 ; The Initial Developer of the Original Code is
  17 ; Marc Bevand <bevand_m@epita.fr> .
  18 ; Portions created by the Initial Developer are
  19 ; Copyright (C) 2004 the Initial Developer. All Rights Reserved.
  20 ;
  21 ; Contributor(s): Makoto Kato (m_kato@ga2.so-net.ne.jp)
  22 ;
  23 ; Alternatively, the contents of this file may be used under the terms of
  24 ; either the GNU General Public License Version 2 or later (the "GPL"), or
  25 ; the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  26 ; in which case the provisions of the GPL or the LGPL are applicable instead
  27 ; of those above. If you wish to allow use of your version of this file only
  28 ; under the terms of either the GPL or the LGPL, and not to allow others to
  29 ; use your version of this file under the terms of the MPL, indicate your
  30 ; decision by deleting the provisions above and replace them with the notice
  31 ; and other provisions required by the GPL or the LGPL. If you do not delete
  32 ; the provisions above, a recipient may use your version of this file under
  33 ; the terms of any one of the MPL, the GPL or the LGPL.
  34 ;
  35 ; ***** END LICENSE BLOCK *****
  36
  37 ; ** ARCFOUR implementation optimized for AMD64.
  38 ; **
  39 ; ** The throughput achieved by this code is about 320 MBytes/sec, on
  40 ; ** a 1.8 GHz AMD Opteron (rev C0) processor.
  41
  42 .CODE
  43
  44 ; ---------------------------------------------------------------------
  45 ; extern void ARCFOUR(RC4Context *cx, unsigned long long inputLen,
  46 ;                     const unsigned char *input, unsigned char *output);
  47 ;
  48 ; XORs inputLen bytes of input with the ARCFOUR keystream and writes the
  49 ; result to output, updating the x, y and table state stored in cx.
  50 ;
  51 ; ABI:    Microsoft x64 (ml64)
  52 ; In:     rcx = cx, rdx = inputLen, r8 = input, r9 = output
  53 ; Layout: as accessed below, [cx] = x, [cx+8] = y, and the table d
  54 ;         starts at cx+16 with one entry per 8-byte slot.
  55 ; Clobb:  rax, rcx, rdx, r8, r9, r11, flags
  56 ; Saved:  rbp, rbx, rsi, rdi (pushed in the prologue, popped before ret)
  57 ;
  58 ; The procedure is declared FRAME and every push is described with
  59 ; .pushreg so that unwind data is emitted for it.  The Windows x64 ABI
  60 ; requires unwind data for any function that saves nonvolatile
  61 ; registers; without it a fault raised while touching input or output
  62 ; cannot be unwound through this frame.
  63 ;
  64 ; Register roles after the prologue:
  65 ;   rbp = d (table base)           rsi = in      rdi = out
  66 ;   rcx = x (already advanced)     rdx = y       rax = tx = d[x]
  67 ;   r9  = in+len-8 (later in+len)  r8  = keystream accumulator
  68 ;   rbx = ty, then val             r11 = per-block byte counter
  69 ;
  70 ; NOTE(review): the pointer comparisons below use signed conditions
  71 ; (jl/jle).  They agree with unsigned ones as long as the buffer does
  72 ; not straddle the sign boundary of the address space.
  73 ; ---------------------------------------------------------------------
  74
  75 ARCFOUR PROC FRAME
  76
  77         push    rbp
  78         .pushreg rbp
  79         push    rbx
  80         .pushreg rbx
  81         push    rsi
  82         .pushreg rsi
  83         push    rdi
  84         .pushreg rdi
  85         .endprolog
  86
  87         mov     rbp, rcx                        ; key = ARG(cx)
  88         mov     rbx, rdx                        ; rbx = ARG(inputLen)
  89         mov     rsi, r8                         ; in = ARG(input)
  90         mov     rdi, r9                         ; out = ARG(output)
  91         mov     rcx, [rbp]                      ; x = key->x
  92         mov     rdx, [rbp+8]                    ; y = key->y
  93         add     rbp, 16                         ; d = key->data
  94         inc     rcx                             ; x++ (look-ahead, undone at Lfinished)
  95         and     rcx, 0ffh                       ; x &= 0xff
  96         lea     rbx, [rbx+rsi-8]                ; rbx = in+len-8
  97         mov     r9, rbx                         ; tmp = in+len-8 (rbx is reused as ty below)
  98         mov     rax, [rbp+rcx*8]                ; tx = d[x]
  99         cmp     rbx, rsi                        ; cmp in with in+len-8
 100         jl      Lend                            ; jump if (in+len-8 < in), i.e. len < 8
 101
 102 Lstart:
 103         add     rsi, 8                          ; increment in (block is read at [rsi-8])
 104         add     rdi, 8                          ; increment out (block is written at [rdi-8])
 105
 106         ;
 107         ; generate the next 8 bytes of the rc4 stream into r8
 108         ;
 109
 110         mov     r11, 8                          ; byte counter
 111
 112 @@:
 113         add     dl, al                          ; y += tx (byte add wraps mod 256)
 114         mov     ebx, [rbp+rdx*8]                ; ty = d[y] (32-bit load zero-extends into rbx)
 115         mov     [rbp+rcx*8], ebx                ; d[x] = ty
 116         add     bl, al                          ; val = ty + tx
 117         mov     [rbp+rdx*8], eax                ; d[y] = tx
 118         inc     cl                              ; x++ (NEXT ROUND)
 119         mov     eax, [rbp+rcx*8]                ; tx = d[x] (NEXT ROUND)
 120         mov     r8b, [rbp+rbx*8]                ; val = d[val]
 121         dec     r11b                            ; sets ZF after the 8th byte
 122         ror     r8, 8                           ; (ror does not change ZF)
 123         jnz     @b                              ; after 8 rotations the first byte is lowest
 124
 125         ;
 126         ; xor 8 bytes
 127         ;
 128
 129         xor     r8, [rsi-8]
 130         cmp     rsi, r9                         ; cmp in+len-8 with in
 131         mov     [rdi-8], r8                     ; mov leaves the cmp flags intact
 132         jle     Lstart                          ; loop while in <= in+len-8
 133
 134 Lend:
 135         add     r9, 8                           ; tmp = in+len
 136
 137         ;
 138         ; handle the last bytes, one by one
 139         ;
 140
 141 @@:
 142         cmp     r9, rsi                         ; cmp in with in+len
 143         jle     Lfinished                       ; jump if (in+len <= in)
 144         add     dl, al                          ; y += tx
 145         mov     ebx, [rbp+rdx*8]                ; ty = d[y]
 146         mov     [rbp+rcx*8], ebx                ; d[x] = ty
 147         add     bl, al                          ; val = ty + tx
 148         mov     [rbp+rdx*8], eax                ; d[y] = tx
 149         inc     cl                              ; x++ (NEXT ROUND)
 150         mov     eax, [rbp+rcx*8]                ; tx = d[x] (NEXT ROUND)
 151         mov     r8b, [rbp+rbx*8]                ; val = d[val]
 152         xor     r8b, [rsi]                      ; xor 1 byte
 153         mov     [rdi], r8b
 154         inc     rsi                             ; in++
 155         inc     rdi                             ; out++
 156         jmp     @b
 157
 158 Lfinished:
 159         dec     rcx                             ; x-- (undo the look-ahead increment)
 160         mov     [rbp-8], dl                     ; key->y = y (low byte only)
 161         mov     [rbp-16], cl                    ; key->x = x (low byte only)
 162
 163         pop     rdi
 164         pop     rsi
 165         pop     rbx
 166         pop     rbp
 167         ret
 168
 169 ARCFOUR ENDP
 170
 171 END