arm/neon/salsa20-core-internal.asm

   1 C nettle, low-level cryptographics library
   2 C
   3 C Copyright (C) 2013 Niels Möller
   4 C
   5 C The nettle library is free software; you can redistribute it and/or modify
   6 C it under the terms of the GNU Lesser General Public License as published by
   7 C the Free Software Foundation; either version 2.1 of the License, or (at your
   8 C option) any later version.
   9 C
  10 C The nettle library is distributed in the hope that it will be useful, but
  11 C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  12 C or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  13 C License for more details.
  14 C
  15 C You should have received a copy of the GNU Lesser General Public License
  16 C along with the nettle library; see the file COPYING.LIB.  If not, write to
  17 C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18 C MA 02111-1301, USA.
  19
  20         .file "salsa20-core-internal.asm"
  21         .fpu    neon
  22
     C Names for the registers used for the three arguments of the C
     C prototype _salsa20_core(dst, src, rounds) given further down.
  23 define(<DST>, <r0>)
  24 define(<SRC>, <r1>)
  25 define(<ROUNDS>, <r2>)
  26
     C X0-X3 hold the four rows, of four 32-bit words each, of the state
     C being processed.
  27 define(<X0>, <q0>)
  28 define(<X1>, <q1>)
  29 define(<X2>, <q2>)
  30 define(<X3>, <q3>)
     C T0 and T1 are scratch registers, used by QROUND and by the lane swaps.
  31 define(<T0>, <q8>)
  32 define(<T1>, <q9>)
     C Lane-select masks, loaded from .Lmasks and used as the vbit selector.
  33 define(<M0101>, <q10>)
  34 define(<M0110>, <q11>)
  35 define(<M0011>, <q12>)
     C S1-S3 keep copies of input rows 1-3 for the final addition.
  36 define(<S1>, <q13>)
  37 define(<S2>, <q14>)
  38 define(<S3>, <q15>)
  39
  40 define(<QROUND>, <
             C QROUND(a, b, c, d), where a, b, c, d are the four macro
             C arguments in order. Works on all four 32-bit lanes at once.
             C Each step adds two registers, rotates every sum left and
             C xors the result into a third register:
             C   b ^= rotl(a + d, 7)
             C   c ^= rotl(a + b, 9)
             C   d ^= rotl(b + c, 13)
             C   a ^= rotl(c + d, 18)
             C The rotate is built from a left shift and the complementary
             C right shift, which are xored into the target one after the
             C other. Overwrites the scratch registers T0 and T1.
  41         vadd.i32        T0, $1, $4
  42         vshl.i32        T1, T0, #7
  43         vshr.u32        T0, T0, #25
  44         veor            $2, $2, T0
  45         veor            $2, $2, T1
  46
  47         vadd.i32        T0, $1, $2
  48         vshl.i32        T1, T0, #9
  49         vshr.u32        T0, T0, #23
  50         veor            $3, $3, T0
  51         veor            $3, $3, T1
  52
  53         vadd.i32        T0, $2, $3
  54         vshl.i32        T1, T0, #13
  55         vshr.u32        T0, T0, #19
  56         veor            $4, $4, T0
  57         veor            $4, $4, T1
  58
  59         vadd.i32        T0, $3, $4
  60         vshl.i32        T1, T0, #18
  61         vshr.u32        T0, T0, #14
  62         veor            $1, $1, T0
  63         veor            $1, $1, T1
  64 >)
  65
  66         .text
  67         .align 4
             C Three masks of four 32-bit words each, loaded in this order
             C into M0101, M0110 and M0011 by the vldm in
             C _nettle_salsa20_core. A word of -1 (all bits set) makes vbit
             C copy that lane from its second operand into its first; a
             C word of 0 leaves the lane unchanged.
             C The table sits in .text, presumably so that the adr in the
             C function can address it relative to the program counter.
             C NOTE(review): for ARM gas, .align 4 should mean 16-byte
             C alignment (power of two) -- confirm.
  68 .Lmasks:
  69         .int 0,-1, 0,-1
  70         .int 0,-1,-1, 0
  71         .int 0, 0,-1,-1
  72
  73         C _salsa20_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
             C
             C Reads sixteen 32-bit words from src, runs the round loop below
             C over them, adds the original input words back in and writes
             C the sixteen result words to dst.
             C
             C The words are first rearranged so that each QROUND call does
             C four quarter-rounds in parallel, one per lane, and they are
             C rearranged back before the final addition.
             C
             C Each pass of the loop performs two rounds and decrements
             C ROUNDS by two. The body runs before the count is tested, so
             C at least two rounds are always done and an odd count is
             C effectively rounded up to the next even number.
             C
             C Uses r12, q0-q3 and q8-q15; nothing is saved or restored.
             C NOTE(review): these should all be caller-saved under the
             C AAPCS -- confirm for the targeted ABI.
  74
  75 PROLOGUE(_nettle_salsa20_core)
  76         vldm    SRC, {X0,X1,X2,X3}
  77
  78         C Input rows:
  79         C        0  1  2  3     X0
  80         C        4  5  6  7     X1
  81         C        8  9 10 11     X2
  82         C       12 13 14 15     X3
  83         C Permuted to:
  84         C        0  5 10 15
  85         C        4  9 14  3
  86         C        8 13  2  7
  87         C       12  1  6 11
  88
  89         C FIXME: Construct in some other way?
  90         adr     r12, .Lmasks
  91         vldm    r12, {M0101, M0110, M0011}
  92
             C Keep the unpermuted input rows 1-3 for the final addition.
             C Row 0 is not copied; it is reloaded from SRC at the end.
  93         vmov    S1, X1
  94         vmov    S2, X2
  95         vmov    S3, X3
  96
  97         C Swaps in columns 1, 3:
  98         C        0  5  2  7     X0 ^
  99         C        4  1  6  3     T0 v
 100         C        8 13 10 15     T1  ^
 101         C       12  9 14 11     X3  v
 102         vmov    T0, X1
 103         vmov    T1, X2
 104         vbit    T0, X0, M0101
 105         vbit    X0, X1, M0101
 106         vbit    T1, X3, M0101
 107         vbit    X3, X2, M0101
 108
 109         C Swaps in column 1, 2:
 110         C        0  5  2  7     X0
 111         C        4  9 14  3     X1 ^
 112         C        8 13 10 15     T1 |
 113         C       12  1  6 11     X3 v
 114         vmov    X1, T0
 115         vbit    X1, X3, M0110
 116         vbit    X3, T0, M0110
 117
 118         C Swaps in columns 2, 3:
 119         C        0  5 10 15     X0 ^
 120         C        4  9 14  3     X1 |
 121         C        8 13  2  7     X2 v
 122         C       12  1  6 11     X3
 123         vmov    X2, T1
 124         vbit    X2, X0, M0011
 125         vbit    X0, T1, M0011
 126
             C Main loop: two rounds per pass.
 127 .Loop:
             C First round of the pair, on the permuted layout shown above.
 128         QROUND(X0, X1, X2, X3)
 129
 130         C Rotate rows, to get
 131         C        0  5 10 15
 132         C        3  4  9 14  >>> 1
 133         C        2  7  8 13  >>> 2
 134         C        1  6 11 12  >>> 3
 135         vext.32 X1, X1, X1, #3
 136         vext.32 X2, X2, X2, #2
 137         vext.32 X3, X3, X3, #1
 138
             C Second round of the pair, on the rotated rows. X1 and X3
             C trade places in the argument list compared to the first call.
 139         QROUND(X0, X3, X2, X1)
 140
             C Count down by two. The flags set here are the ones tested by
             C the bhi below; the vext instructions in between are not
             C flag-setting forms.
 141         subs    ROUNDS, ROUNDS, #2
 142         C Inverse rotation
 143         vext.32 X1, X1, X1, #1
 144         vext.32 X2, X2, X2, #2
 145         vext.32 X3, X3, X3, #3
 146
             C Go again only if the subtraction did not borrow and left a
             C nonzero count.
 147         bhi     .Loop
 148
 149         C Inverse swaps
 150         vmov    T1, X2
 151         vbit    T1, X0, M0011
 152         vbit    X0, X2, M0011
 153
 154         vmov    T0, X1
 155         vbit    T0, X3, M0110
 156         vbit    X3, X1, M0110
 157
 158         vmov    X1, T0
 159         vmov    X2, T1
 160         vbit    X1, X0, M0101
 161         vbit    X0, T0, M0101
 162         vbit    X2, X3, M0101
 163         vbit    X3, T1, M0101
 164
             C Add the original input to the result. Row 0 was not saved in a
             C register, so its sixteen bytes are read from SRC again.
 165         vld1.64 {T0}, [SRC]
 166         vadd.u32        X0, X0, T0
 167         vadd.u32        X1, X1, S1
 168         vadd.u32        X2, X2, S2
 169         vadd.u32        X3, X3, S3
 170
 171         vstm    DST, {X0,X1,X2,X3}
 172         bx      lr
 173 EPILOGUE(_nettle_salsa20_core)
 174
 175 divert(-1)
     C Everything after the divert above is dropped from the m4 output, so
     C none of it is assembled. NOTE(review): the lines below look like a gdb
     C user-defined command that prints q0-q3 as 32-bit words in hex -- confirm.
 176 define salsastate
 177 p/x $q0.u32
 178 p/x $q1.u32
 179 p/x $q2.u32
 180 p/x $q3.u32
 181 end