arm/neon/umac-nh-n.asm

   1 C nettle, low-level cryptographics library
   2 C
   3 C Copyright (C) 2013 Niels Möller
   4 C
   5 C The nettle library is free software; you can redistribute it and/or modify
   6 C it under the terms of the GNU Lesser General Public License as published by
   7 C the Free Software Foundation; either version 2.1 of the License, or (at your
   8 C option) any later version.
   9 C
  10 C The nettle library is distributed in the hope that it will be useful, but
  11 C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  12 C or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  13 C License for more details.
  14 C
  15 C You should have received a copy of the GNU Lesser General Public License
  16 C along with the nettle library; see the file COPYING.LIB.  If not, write to
  17 C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  18 C MA 02111-1301, USA.
  19
  20         .file "umac-nh-n.asm"
  21         .fpu    neon
  22
             C Core register roles, as used by the code below.
  23 define(<OUT>, <r0>)     C Pointer the 64-bit accumulators are stored through.
  24 define(<ITERS>, <r1>)   C Compared against 3 to pick the 2, 3 or 4 iteration path.
  25 define(<KEY>, <r2>)     C Subkey pointer, advanced by every key load.
  26 define(<LENGTH>, <r3>)  C Message bytes left; decremented by 32 per loop pass.
  27 define(<MSG>, <r12>)    C Message pointer, read from the stack, rounded down to 8 bytes.
  28 define(<SHIFT>, <r14>)  C Shift count scratch. r14 is lr, which is saved on entry.
  29
             C NEON registers. q4-q6 are pushed and popped by the paths using them.
  30 define(<QA>, <q0>)      C First 16 bytes of each 32-byte message load.
  31 define(<QB>, <q1>)      C Second 16 bytes of each 32-byte message load.
  32 define(<QY0>, <q3>)     C Accumulates the first two iterations.
  33 define(<DM>, <d4>)      C Message doubleword carried over between loads (low half of q2).
  34 define(<QY1>, <q4>)     C Used for 3 and 4 iterations.
  35 define(<QC>, <q5>)      C Temporary, 4 iteration path only.
  36 define(<QD>, <q6>)      C Temporary, 4 iteration path only.
  37 define(<QLEFT>, <q8>)   C Left shift count for realigning message data.
  38 define(<QRIGHT>, <q9>)  C Right shift count (negative), same purpose.
  39 define(<QT0>, <q10>)    C Temporaries.
  40 define(<QT1>, <q11>)
  41 define(<QT2>, <q12>)
  42 define(<QK0>, <q13>)    C Subkeys.
  43 define(<QK1>, <q14>)
  44 define(<QK2>, <q15>)
  45
  46 C FIXME: Try permuting subkeys using vld4, vzip or similar.
  47
  48         .text
  49         .align  3
  50
  51 PROLOGUE(_nettle_umac_nh_n)
             C Register arguments are OUT (r0), ITERS (r1), KEY (r2) and
             C LENGTH (r3); MSG is read from the first word on the stack.
             C Presumably this matches a C prototype along the lines of
             C   (uint64_t *out, unsigned n, const uint32_t *key,
             C    unsigned length, const uint8_t *msg)
             C -- TODO confirm against the C declaration.
             C
             C ITERS < 3 takes the .Lnh2 path, ITERS == 3 the .Lnh3 path and
             C anything larger the .Lnh4 path, storing 2, 3 and 4 64-bit
             C results to OUT, respectively. Every loop consumes 32 message
             C bytes per pass and always runs at least once; LENGTH is
             C presumably a non-zero multiple of 32 -- verify against callers.
  52         ldr     MSG, [sp]               C Fifth argument
  53         str     lr, [sp, #-4]!          C Save lr, since SHIFT reuses r14
  54
  55         C Setup for 64-bit aligned reads
             C SHIFT gets the misalignment of MSG in bytes, and MSG is rounded
             C down to a multiple of 8. DM is loaded with the first aligned
             C doubleword. For an unaligned MSG, the pointer is advanced past
             C that doubleword. For an aligned MSG (Z set by the ands), the
             C pointer stays put and SHIFT is set to 8, giving a right shift
             C count of 64 and a left shift count of 0, so that DM is expected
             C to be shifted out entirely and the loop re-reads the same data.
  56         ands    SHIFT, MSG, #7
  57         and     MSG, MSG, #-8
  58         vld1.8  {DM}, [MSG :64]
  59         addne   MSG, MSG, #8
  60         addeq   SHIFT, SHIFT, #8
  61
  62         C FIXME: Combine as rsb ?
             C SHIFT <-- -8 * SHIFT, a negative bit count, used as a right
             C shift by vshl.
  63         lsl     SHIFT, SHIFT, #3
  64         neg     SHIFT, SHIFT
  65
  66         C Right shift in QRIGHT (both halves)
  67         vmov.i32 D0REG(QRIGHT)[0], SHIFT
  68         vmov.32  D1REG(QRIGHT), D0REG(QRIGHT)
  69         add     SHIFT, SHIFT, #64       C Left shift = 64 - right shift
  70
             C Left shift in QLEFT (both halves)
  71         vmov.i32 D0REG(QLEFT)[0], SHIFT
  72         vmov.32  D1REG(QLEFT), D0REG(QLEFT)
  73         cmp     r1, #3                  C r1 is ITERS
  74         vmov.i64 QY0, #0
  75
  76         vshl.u64 DM, DM, D0REG(QRIGHT)  C Pre-shift the first doubleword
  77         bcc     .Lnh2                   C ITERS < 3
  78         beq     .Lnh3                   C ITERS == 3
  79
  80 .Lnh4:
  81         C Permute key words, so we in each iteration have them in order
  82         C
  83         C P0: [0, 4,1, 5] P1: [ 2, 6, 3, 7] P2: [ 4, 8, 5, 9] P3: [ 6,10, 7,11]
  84         C P4: [8,12,9,13] P5: [10,14,11,15] P6: [12,16,13,17] P7: [14,18,15,19]
  85         C
  86         C Also arrange the message words, so we get them as
  87         C M0: [0,0,1,1] M1: [ 2, 2, 3, 3] M2: [ 4, 4, 5, 5] M3: [ 6, 6, 7, 7]
  88         C M4: [8,8,9,9] M5: [10,10,11,11] M6: [12,12,13,13] M7: [14,14,15,15]
  89         C
  90         C Then, accumulate Y0 (first two "iters") using
  91         C
  92         C Y0 += (M0+P0) * (M2+P2) + (M1+P1) * (M3+P3)
  93         C Y1 += (M0+P4) * (M2+P6) + (M1+P5) * (M3+P7)
  94         C
  95         C Next iteration is then
  96         C
  97         C Y0 += (M4+P4) * (M6+P6) + (M5+P5) * (M7 + P7)
  98         C Y1 += (M4+P8) * (M6+P10) + (M5+P9) * (M7 + P11)
  99         C
 100         C So we can reuse P4, P5, P6, P7 from the previous iteration.
 101
 102         C How to fit in registers? We need 4 Q regs for P0-P3, and one
 103         C more for the last read key. We need at least two registers
 104         C for the message (QA and QB, more if we want to expand only
 105         C once). For the Y0 update, we can let the factors overwrite
 106         C P0-P3, and for the Y1 update, we can overwrite M0-M3.
 107
 108         vpush   {q4,q5,q6}              C QY1, QC, QD
 109         vld1.32 {QK0,QK1}, [KEY]!
 110         vld1.32 {QK2}, [KEY]!
 111         vmov    QT0, QK1
 112         vmov    QT1, QK2
 113
 114         C Permute keys. QK2 is untouched, permuted subkeys put in QK0,QK1,QT0,QT1
 115         vtrn.32 QK0, QK1                C Gives us [0, 4, 2, 6] and [1, 5, 3, 7]
 116         vswp D1REG(QK0), D0REG(QK1)     C Gives us [0, 4, 1, 5] and [2, 6, 3, 7]
 117         vtrn.32 QT0, QT1                C Gives us [4,8,6,10] and [5 ,9,7,11]
 118         vswp D1REG(QT0), D0REG(QT1)     C Gives us [4,8,5, 9] and [6,10,7,11]
 119
 120         vmov.i64 QY1, #0
 121 .Loop4:
 122         C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
 123         vld1.8 {QA, QB}, [MSG :64]!
 124         vshl.u64 QC, QA, QRIGHT
 125         vshl.u64 QD, QB, QRIGHT
 126         vshl.u64 QA, QA, QLEFT
 127         vshl.u64 QB, QB, QLEFT
 128         veor    D0REG(QA), D0REG(QA), DM
 129         veor    D1REG(QA), D1REG(QA), D0REG(QC)
 130         veor    D0REG(QB), D0REG(QB), D1REG(QC)
 131         veor    D1REG(QB), D1REG(QB), D0REG(QD)
 132         vmov    DM, D1REG(QD)           C Carry over to the next pass
 133
 134         C Explode message (too bad there's no vadd with scalar)
 135         vdup.32 D1REG(QD), D1REG(QB)[1]
 136         vdup.32 D0REG(QD), D1REG(QB)[0]
 137         vdup.32 D1REG(QC), D0REG(QB)[1]
 138         vdup.32 D0REG(QC), D0REG(QB)[0]
 139         vdup.32 D1REG(QB), D1REG(QA)[1]
 140         vdup.32 D0REG(QB), D1REG(QA)[0]
 141         vdup.32 D1REG(QA), D0REG(QA)[1]
 142         vdup.32 D0REG(QA), D0REG(QA)[0]
 143
             C Y0 factors: message added into the permuted subkeys in place
 144         vadd.i32 QK0, QK0, QA
 145         vadd.i32 QK1, QK1, QB
 146         vadd.i32 QT0, QT0, QC
 147         vadd.i32 QT1, QT1, QD
 148
 149         vmlal.u32 QY0, D0REG(QK0), D0REG(QT0)
 150         vmlal.u32 QY0, D1REG(QK0), D1REG(QT0)
 151         vmlal.u32 QY0, D0REG(QK1), D0REG(QT1)
 152         vmlal.u32 QY0, D1REG(QK1), D1REG(QT1)
 153
 154         C Next 8 key words; permuted together with the 4 saved in QK2
 155         vld1.32 {QT0,QT1}, [KEY]!
 156         vmov    QK0, QK2
 157         vmov    QK1, QT0
 158         vmov    QK2, QT1                C Save
 159         vtrn.32 QK0, QK1                C Gives us [8,12,10,14] and [9,13,11,15]
 160         vswp D1REG(QK0), D0REG(QK1)     C Gives us [8,12,9,13] and [10,14,11,15]
 161         vtrn.32 QT0, QT1                C Gives us [12,16,14,18] and [13,17,15,19]
 162         vswp D1REG(QT0), D0REG(QT1)     C Gives us [12,16,13,17] and [14,18,15,19]
 163
             C Y1 factors overwrite the message; the permuted subkeys are
             C left intact, for the Y0 update of the next pass.
 164         vadd.i32 QA, QA, QK0
 165         vadd.i32 QB, QB, QK1
 166         vadd.i32 QC, QC, QT0
 167         vadd.i32 QD, QD, QT1
 168
 169         subs    LENGTH, LENGTH, #32
 170
 171         vmlal.u32 QY1, D0REG(QA), D0REG(QC)
 172         vmlal.u32 QY1, D1REG(QA), D1REG(QC)
 173         vmlal.u32 QY1, D0REG(QB), D0REG(QD)
 174         vmlal.u32 QY1, D1REG(QB), D1REG(QD)
 175
 176         bhi     .Loop4                  C Flags from the subs above
 177
 178         vst1.64 {QY0, QY1}, [OUT]       C Four 64-bit results
 179
 180         vpop    {q4,q5,q6}
 181
 182         ldr     pc, [sp], #+4           C Return via the saved lr
 183
 184 .Lnh3:
 185         vpush   {q4}                    C QY1
 186         vld1.32 {QK0,QK1}, [KEY]!
 187         vmov.i64 QY1, #0
 188 .Loop3:
 189         C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
 190         vld1.8 {QA, QB}, [MSG :64]!
 191         vshl.u64 QT0, QA, QRIGHT
 192         vshl.u64 QT1, QB, QRIGHT
 193         vshl.u64 QA, QA, QLEFT
 194         vshl.u64 QB, QB, QLEFT
 195         veor    D0REG(QA), D0REG(QA), DM
 196         veor    D1REG(QA), D1REG(QA), D0REG(QT0)
 197         veor    D0REG(QB), D0REG(QB), D1REG(QT0)
 198         veor    D1REG(QB), D1REG(QB), D0REG(QT1)
 199         vmov    DM, D1REG(QT1)          C Carry over to the next pass
 200
 201         vld1.32 {QK2}, [KEY]!
 202         C Construct factors, with low half corresponding to first iteration,
 203         C and high half corresponding to the second iteration.
 204         vmov    QT0, QK1
 205         vtrn.32 QK0, QT0                C Gives us [0, 4, 2, 6] and [1, 5, 3, 7]
 206         vswp D1REG(QK0), D0REG(QT0)     C Gives us [0, 4, 1, 5] and [2, 6, 3, 7]
 207         vdup.32 D0REG(QT1), D0REG(QA)[0]
 208         vdup.32 D1REG(QT1), D0REG(QA)[1]
 209         vadd.i32        QT1, QT1, QK0
 210
 211         vmov    QK0, QK2                C Save for next iteration
 212         vtrn.32 QK1, QK2                C Gives us [4, 8, 6, 10] and [5,  9, 7, 11]
 213         vswp    D1REG(QK1), D0REG(QK2)  C Gives us [4, 8, 5,  9] and [6, 10, 7, 11]
 214
 215         vdup.32 D0REG(QT2), D0REG(QB)[0]
 216         vdup.32 D1REG(QT2), D0REG(QB)[1]
 217         vadd.i32 QK1, QK1, QT2
 218         vmlal.u32 QY0, D0REG(QT1), D0REG(QK1)
 219         vmlal.u32 QY0, D1REG(QT1), D1REG(QK1)
 220
 221         vdup.32 D0REG(QT1), D1REG(QA)[0]
 222         vdup.32 D1REG(QT1), D1REG(QA)[1]
 223         vadd.i32        QT0, QT0, QT1
 224         vdup.32 D0REG(QT1), D1REG(QB)[0]
 225         vdup.32 D1REG(QT1), D1REG(QB)[1]
 226         vadd.i32        QK2, QK2, QT1
 227
 228         vmlal.u32 QY0, D0REG(QT0), D0REG(QK2)
 229         vmlal.u32 QY0, D1REG(QT0), D1REG(QK2)
 230
             C Third iteration, using the unpermuted key words saved in QK0
             C and the next four, loaded into QK1. Both registers are left
             C unmodified, and serve as QK0, QK1 for the next pass.
 231         vld1.32 {QK1}, [KEY]!
 232         vadd.i32 QA, QA, QK0
 233         vadd.i32 QB, QB, QK1
 234         subs    LENGTH, LENGTH, #32
 235         vmlal.u32 QY1, D0REG(QA), D0REG(QB)
 236         vmlal.u32 QY1, D1REG(QA), D1REG(QB)
 237         bhi     .Loop3                  C Flags from the subs above
 238
             C Fold the two partial sums of the third iteration into one
 239         vadd.i64 D0REG(QY1), D0REG(QY1), D1REG(QY1)
 240         vst1.64 {D0REG(QY0), D1REG(QY0), D0REG(QY1)}, [OUT]
 241
 242         vpop    {q4}
 243
 244         ldr     pc, [sp], #+4           C Return via the saved lr
 245
 246 .Lnh2:
             C Same as the QY0 part of .Loop3. This path uses none of q4-q6,
             C so there is nothing to push.
 247         vld1.32 {QK0}, [KEY]!
 248 .Loop2:
 249         C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
 250         vld1.8 {QA, QB}, [MSG :64]!
 251         vshl.u64 QT0, QA, QRIGHT
 252         vshl.u64 QT1, QB, QRIGHT
 253         vshl.u64 QA, QA, QLEFT
 254         vshl.u64 QB, QB, QLEFT
 255         veor    D0REG(QA), D0REG(QA), DM
 256         veor    D1REG(QA), D1REG(QA), D0REG(QT0)
 257         veor    D0REG(QB), D0REG(QB), D1REG(QT0)
 258         veor    D1REG(QB), D1REG(QB), D0REG(QT1)
 259         vmov    DM, D1REG(QT1)          C Carry over to the next pass
 260
 261         vld1.32 {QK1,QK2}, [KEY]!
 262         C Construct factors, with low half corresponding to first iteration,
 263         C and high half corresponding to the second iteration.
 264         vmov    QT0, QK1
 265         vtrn.32 QK0, QT0                C Gives us [0, 4, 2, 6] and [1, 5, 3, 7]
 266         vswp D1REG(QK0), D0REG(QT0)     C Gives us [0, 4, 1, 5] and [2, 6, 3, 7]
 267         vdup.32 D0REG(QT1), D0REG(QA)[0]
 268         vdup.32 D1REG(QT1), D0REG(QA)[1]
 269         vadd.i32        QT1, QT1, QK0
 270
 271         vmov    QK0, QK2                C Save for next iteration
 272         vtrn.32 QK1, QK2                C Gives us [4, 8, 6, 10] and [5,  9, 7, 11]
 273         vswp    D1REG(QK1), D0REG(QK2)  C Gives us [4, 8, 5,  9] and [6, 10, 7, 11]
 274
 275         vdup.32 D0REG(QT2), D0REG(QB)[0]
 276         vdup.32 D1REG(QT2), D0REG(QB)[1]
 277         vadd.i32 QK1, QK1, QT2
 278         vmlal.u32 QY0, D0REG(QT1), D0REG(QK1)
 279         vmlal.u32 QY0, D1REG(QT1), D1REG(QK1)
 280
 281         vdup.32 D0REG(QT1), D1REG(QA)[0]
 282         vdup.32 D1REG(QT1), D1REG(QA)[1]
 283         vadd.i32        QT0, QT0, QT1
 284         vdup.32 D0REG(QT1), D1REG(QB)[0]
 285         vdup.32 D1REG(QT1), D1REG(QB)[1]
 286         vadd.i32        QK2, QK2, QT1
 287
 288         subs    LENGTH, LENGTH, #32
 289
 290         vmlal.u32 QY0, D0REG(QT0), D0REG(QK2)
 291         vmlal.u32 QY0, D1REG(QT0), D1REG(QK2)
 292
 293         bhi     .Loop2                  C Flags from the subs above
 294         vst1.64 {QY0}, [OUT]            C Two 64-bit results
 295
 296 .Lend:
 297         ldr     pc, [sp], #+4           C Return via the saved lr
 298 EPILOGUE(_nettle_umac_nh_n)