C x86_64/salsa20-crypt.asm

C nettle, low-level cryptographics library
C
C Copyright (C) 2012 Niels Möller
C
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
C License for more details.
C
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB.  If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
C MA 02111-1301, USA.

  20 define(<CTX>, <%rdi>)
  21 define(<LENGTH>, <%rsi>)
  22 define(<DST>, <%rdx>)
  23 define(<SRC>, <%rcx>)
  24 define(<T64>, <%r8>)
  25 define(<POS>, <%r9>)
  26 define(<X0>, <%xmm0>)
  27 define(<X1>, <%xmm1>)
  28 define(<X2>, <%xmm2>)
  29 define(<X3>, <%xmm3>)
  30 define(<T0>, <%xmm4>)
  31 define(<T1>, <%xmm5>)
  32 define(<M0101>, <%xmm6>)
  33 define(<M0110>, <%xmm7>)
  34 define(<M0011>, <%xmm8>)
  35 define(<COUNT>, <%rax>)
  36
  37 include_src(<x86_64/salsa20.m4>)
  38
C Possible improvements:
C
C Do two blocks (or more) at a time in parallel, to avoid limitations
C due to data dependencies.
C
C Avoid redoing the permutation of the input for each block (all but
C the two counter words are constant). Could also keep the input in
C registers.

  48         .file "salsa20-crypt.asm"
  49
  50         C salsa20_crypt(struct salsa20_ctx *ctx, unsigned length,
  51         C               uint8_t *dst, const uint8_t *src)
  52         .text
  53         ALIGN(16)
  54 PROLOGUE(nettle_salsa20_crypt)
  55         W64_ENTRY(4, 9)
  56
  57         test    LENGTH, LENGTH
  58         jz      .Lend
  59
  60         C Load mask registers
  61         mov     $-1, XREG(COUNT)
  62         movd    XREG(COUNT), M0101
  63         pshufd  $0x09, M0101, M0011     C 01 01 00 00
  64         pshufd  $0x41, M0101, M0110     C 01 00 00 01
  65         pshufd  $0x22, M0101, M0101     C 01 00 01 00
  66
  67 .Lblock_loop:
  68         movups  (CTX), X0
  69         movups  16(CTX), X1
  70         movups  32(CTX), X2
  71         movups  48(CTX), X3
  72
  73         C On input, each xmm register is one row. We start with
  74         C
  75         C        0  1  2  3     C K K K
  76         C        4  5  6  7     K C I I
  77         C        8  9 10 11     B B C K
  78         C       12 13 14 15     K K K C
  79         C
  80         C Diagrams are in little-endian order, with least significant word to
  81         C the left. We rotate the columns, to get instead
  82         C
  83         C        0  5 10 15     C C C C
  84         C        4  9 14  3     K B K K
  85         C        8 13  2  7     B K K I
  86         C       12  1  6 11     K K I K
  87         C
  88         C The original rows are now diagonals.
  89         SWAP(X0, X1, M0101)
  90         SWAP(X2, X3, M0101)
  91         SWAP(X1, X3, M0110)
  92         SWAP(X0, X2, M0011)
  93
  94         movl    $10, XREG(COUNT)
  95         ALIGN(16)
  96 .Loop:
  97         QROUND(X0, X1, X2, X3)
  98         C For the row operations, we first rotate the rows, to get
  99         C
 100         C       0 5 10 15
 101         C       3 4  9 14
 102         C       2 7  8 13
 103         C       1 6 11 12
 104         C
 105         C Now the original rows are turned into into columns. (This
 106         C SIMD hack described in djb's papers).
 107
 108         pshufd  $0x93, X1, X1   C       11 00 01 10 (least sign. left)
 109         pshufd  $0x4e, X2, X2   C       10 11 00 01
 110         pshufd  $0x39, X3, X3   C       01 10 11 00
 111
 112         QROUND(X0, X3, X2, X1)
 113
 114         C Inverse rotation of the rows
 115         pshufd  $0x39, X1, X1   C       01 10 11 00
 116         pshufd  $0x4e, X2, X2   C       10 11 00 01
 117         pshufd  $0x93, X3, X3   C       11 00 01 10
 118
 119         decl    XREG(COUNT)
 120         jnz     .Loop
 121
 122         SWAP(X0, X2, M0011)
 123         SWAP(X1, X3, M0110)
 124         SWAP(X0, X1, M0101)
 125         SWAP(X2, X3, M0101)
 126
 127         movups  (CTX), T0
 128         movups  16(CTX), T1
 129         paddd   T0, X0
 130         paddd   T1, X1
 131         movups  32(CTX), T0
 132         movups  48(CTX), T1
 133         paddd   T0, X2
 134         paddd   T1, X3
 135
 136         C Increment block counter
 137         incq    32(CTX)
 138
 139         cmp     $64, LENGTH
 140         jc      .Lfinal_xor
 141
 142         movups  48(SRC), T1
 143         pxor    T1, X3
 144         movups  X3, 48(DST)
 145 .Lxor3:
 146         movups  32(SRC), T0
 147         pxor    T0, X2
 148         movups  X2, 32(DST)
 149 .Lxor2:
 150         movups  16(SRC), T1
 151         pxor    T1, X1
 152         movups  X1, 16(DST)
 153 .Lxor1:
 154         movups  (SRC), T0
 155         pxor    T0, X0
 156         movups  X0, (DST)
 157
 158         lea     64(SRC), SRC
 159         lea     64(DST), DST
 160         sub     $64, LENGTH
 161         ja      .Lblock_loop
 162 .Lend:
 163         W64_EXIT(4, 9)
 164         ret
 165
 166 .Lfinal_xor:
 167         cmp     $32, LENGTH
 168         jz      .Lxor2
 169         jc      .Llt32
 170         cmp     $48, LENGTH
 171         jz      .Lxor3
 172         jc      .Llt48
 173         movaps  X3, T0
 174         call    .Lpartial
 175         jmp     .Lxor3
 176 .Llt48:
 177         movaps  X2, T0
 178         call    .Lpartial
 179         jmp     .Lxor2
 180 .Llt32:
 181         cmp     $16, LENGTH
 182         jz      .Lxor1
 183         jc      .Llt16
 184         movaps  X1, T0
 185         call    .Lpartial
 186         jmp     .Lxor1
 187 .Llt16:
 188         movaps  X0, T0
 189         call    .Lpartial
 190         jmp     .Lend
 191
 192 .Lpartial:
 193         mov     LENGTH, POS
 194         and     $-16, POS
 195         test    $8, LENGTH
 196         jz      .Llt8
 197         C This "movd" instruction should assemble to
 198         C 66 49 0f 7e e0          movq   %xmm4,%r8
 199         C Apparently, assemblers treat movd and movq (with the
 200         C arguments we use) in the same way, except for osx, which
 201         C barfs at movq.
 202         movd    T0, T64
 203         xor     (SRC, POS), T64
 204         mov     T64, (DST, POS)
 205         lea     8(POS), POS
 206         pshufd  $0xee, T0, T0           C 10 11 10 11
 207 .Llt8:
 208         C And this is also really a movq.
 209         movd    T0, T64
 210         test    $4, LENGTH
 211         jz      .Llt4
 212         mov     XREG(T64), XREG(COUNT)
 213         xor     (SRC, POS), XREG(COUNT)
 214         mov     XREG(COUNT), (DST, POS)
 215         lea     4(POS), POS
 216         shr     $32, T64
 217 .Llt4:
 218         test    $2, LENGTH
 219         jz      .Llt2
 220         mov     WREG(T64), WREG(COUNT)
 221         xor     (SRC, POS), WREG(COUNT)
 222         mov     WREG(COUNT), (DST, POS)
 223         lea     2(POS), POS
 224         shr     $16, XREG(T64)
 225 .Llt2:
 226         test    $1, LENGTH
 227         jz      .Lret
 228         xor     (SRC, POS), LREG(T64)
 229         mov     LREG(T64), (DST, POS)
 230
 231 .Lret:
 232         ret
 233
 234 EPILOGUE(nettle_salsa20_crypt)