C nettle, low-level cryptographic library
C
C Copyright (C) 2013 Niels Möller
C
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
C License for more details.
C
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB. If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
C MA 02110-1301, USA.
	.file "sha512-compress.asm"

define(<STATE>, <r0>)
define(<INPUT>, <r1>)
define(<K>, <r2>)
define(<COUNT>, <r3>)
define(<SHIFT>, <r12>)
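C SHIFT holds the byte misalignment of INPUT during the load phase;
C r12 (ip) is call-clobbered under the AAPCS, so it needs no saving.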
C d8-d15 are callee-save
C Used only when reading the input; can overlap with the state
define(<QW0001>, <q8>)
define(<QW0203>, <q9>)
define(<QW0405>, <q10>)
define(<QW0607>, <q11>)
define(<QW0809>, <q12>)
define(<QW1011>, <q13>)
define(<QW1213>, <q14>)
define(<QW1415>, <q15>)

define(<EXPAND_ME>, <$1>)
define(<W>, <EXPAND_ME(<DW>eval(($1) % 16))>)
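C W(i) selects one of the 16 schedule doublewords DW0-DW15 by
C reducing the index mod 16; the schedule is kept as a rolling
C 16-entry window rather than a full 80-entry array.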
C If x = W(i+14), y = W(i+1), we xor in parallel:
C
C	x << 45		y << 63
C	x >> 19		y >>  1
C	x <<  3		y << 56
C	x >> 61		y >>  8
C	x >>  6		y >>  7
C	-----------------------------
C	s1(x)		s0(y)
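C As C-like pseudocode (a sketch, not part of this file), one
C expansion step computes, with all indices taken mod 16:
C
C   W[i] += s1(W[i+14]) + W[i+9] + s0(W[i+1]);
C
C where s1(x) = ROTR19(x) ^ ROTR61(x) ^ (x >> 6) and
C s0(y) = ROTR1(y) ^ ROTR8(y) ^ (y >> 7), as in FIPS 180-4.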
define(<EXPN>, <
	vshl.i64	DT0, W($1+14), #45
	vshl.i64	DT1, W($1 + 1), #63
	vshr.u64	DT2, W($1+14), #19
	vshr.u64	DT3, W($1 + 1), #1
	vshl.i64	DT4, W($1+14), #3
	vshl.i64	DT5, W($1 + 1), #56
	veor.i64	QT01, QT01, QT23
	vshr.u64	DT2, W($1+14), #61
	vshr.u64	DT3, W($1 + 1), #8
	veor.i64	QT01, QT01, QT45
	vshr.u64	DT4, W($1+14), #6
	vshr.u64	DT5, W($1 + 1), #7
	veor.i64	QT01, QT01, QT23
	vadd.i64	W($1), W($1), W($1 + 9)
	veor.i64	QT01, QT01, QT45
	vadd.i64	W($1), W($1), DT0
	vadd.i64	W($1), W($1), DT1
>)
C ROUND(A,B,C,D,E,F,G,H,i)
C
C H += S1(E) + Choice(E,F,G) + K + W
C D += H
C H += S0(A) + Majority(A,B,C)
C
C Where
C
C S1(E) = E<<<50 ^ E<<<46 ^ E<<<23
C S0(A) = A<<<36 ^ A<<<30 ^ A<<<25
C Choice(E,F,G) = G^(E&(F^G))
C Majority(A,B,C) = (A&B) + (C&(A^B))
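C For reference, one round in C-like pseudocode (a sketch; T1 is a
C hypothetical temporary, not a register in this file):
C
C   T1 = H + S1(E) + Choice(E,F,G) + K[i] + W[i];
C   D += T1;
C   H  = T1 + S0(A) + Majority(A,B,C);
C
C Note that in Majority the terms A&B and C&(A^B) are bit-wise
C disjoint, so the + above is equivalent to xor.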
C Do S1 and S0 in parallel:
C
C	e << 50		a << 36
C	e >> 14		a >> 28
C	e << 46		a << 30
C	e >> 18		a >> 34
C	e << 23		a << 25
C	e >> 41		a >> 39
C	----------------------------
C	S1(e)		S0(a)
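C NEON has no 64-bit rotate instruction, so each rotation above is
C built from a left shift and a right shift whose counts sum to 64,
C combined with veor; packing the e- and a-halves into one
C q-register lets each veor advance both xor chains at once.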
	vshl.i64	DT0, $5, #50
	vshl.i64	DT1, $1, #36
	vshr.u64	DT2, $5, #14
	vshr.u64	DT3, $1, #28
	vshl.i64	DT4, $5, #46
	vshl.i64	DT5, $1, #30
	veor	QT01, QT01, QT23
	vshr.u64	DT2, $5, #18
	vshr.u64	DT3, $1, #34
	veor	QT01, QT01, QT45
	vshl.i64	DT4, $5, #23
	vshl.i64	DT5, $1, #25
	veor	QT01, QT01, QT23
	vshr.u64	DT2, $5, #41
	vshr.u64	DT3, $1, #39
	veor	QT01, QT01, QT45
	veor	QT01, QT01, QT23
	vldr	DT3, [K,#eval(8*$9)]
	vadd.i64	$8, $8, W($9)
	vadd.i64	QT01, QT01, QT45
	vadd.i64	DT1, DT1, DT2
C _nettle_sha512_compress(uint64_t *state, const uint8_t *input, const uint64_t *k)
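C Per the AAPCS, the arguments arrive as STATE in r0, INPUT in r1
C and K in r2; K points at the 80-entry SHA-512 round constant
C table.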
PROLOGUE(_nettle_sha512_compress)
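	C Save the callee-saved d-registers clobbered below; d8-d13
	C are the only ones of d8-d15 this function touches.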
	vpush	{d8,d9,d10,d11,d12,d13}

	C Load the input in an alignment-robust way: round INPUT down
	C to an 8-byte boundary and shift each doubleword into place.
	ands	SHIFT, INPUT, #7	C SHIFT = misalignment in bytes
	and	INPUT, INPUT, #-8	C Align INPUT down
	vld1.8	{DT5}, [INPUT :64]	C Dword holding the first input bytes
	addne	INPUT, INPUT, #8	C Misaligned: that dword is consumed
	addeq	SHIFT, SHIFT, #8	C Aligned: force shifts to 64 and 0
	lsl	SHIFT, SHIFT, #3	C Byte offset to bit count
	C Put the right shift count in DT0 and DT1, aka QT01. The
	C count is negated, since vshl with a negative count shifts
	C right.
	neg	SHIFT, SHIFT
	vmov.32	DT0[0], SHIFT
	vmov.32	DT1[0], SHIFT
	C Put the left shift count in DT2 and DT3, aka QT23
	add	SHIFT, SHIFT, #64
	vmov.32	DT2[0], SHIFT
	vmov.32	DT3[0], SHIFT

	vshl.u64	DT5, DT5, DT0
	C Set w[i] <-- w[i-1] >> RSHIFT + w[i] << LSHIFT
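	C In C-like pseudocode (a sketch), with r = 8 * misalignment
	C and l = 64 - r:
	C
	C   w[i] = (prev >> r) ^ (cur << l);  /* prev = preceding dword */
	C
	C The two halves occupy disjoint bits, so xor, or and add all
	C combine them correctly; in the aligned case r is 64, the
	C carry term is zero, and the shift by l = 0 leaves w[i]
	C unchanged.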
	vld1.8	{W(0),W(1),W(2),W(3)}, [INPUT :64]!
	vshl.u64	QT67, QW0001, QT01	C Right shift
	vshl.u64	QW0001, QW0001, QT23	C Left shift
	veor	W(0), W(0), DT5
	veor	W(1), W(1), DT6
	vrev64.8	QW0001, QW0001
	vshl.u64	QT45, QW0203, QT01	C Right shift
	vshl.u64	QW0203, QW0203, QT23	C Left shift
	veor	W(2), W(2), DT7
	veor	W(3), W(3), DT4
	vrev64.8	QW0203, QW0203

	vld1.8	{W(4),W(5),W(6),W(7)}, [INPUT :64]!
	vshl.u64	QT67, QW0405, QT01	C Right shift
	vshl.u64	QW0405, QW0405, QT23	C Left shift
	veor	W(4), W(4), DT5
	veor	W(5), W(5), DT6
	vrev64.8	QW0405, QW0405
	vshl.u64	QT45, QW0607, QT01	C Right shift
	vshl.u64	QW0607, QW0607, QT23	C Left shift
	veor	W(6), W(6), DT7
	veor	W(7), W(7), DT4
	vrev64.8	QW0607, QW0607

	vld1.8	{W(8),W(9),W(10),W(11)}, [INPUT :64]!
	vshl.u64	QT67, QW0809, QT01	C Right shift
	vshl.u64	QW0809, QW0809, QT23	C Left shift
	veor	W(8), W(8), DT5
	veor	W(9), W(9), DT6
	vrev64.8	QW0809, QW0809
	vshl.u64	QT45, QW1011, QT01	C Right shift
	vshl.u64	QW1011, QW1011, QT23	C Left shift
	veor	W(10), W(10), DT7
	veor	W(11), W(11), DT4
	vrev64.8	QW1011, QW1011

	vld1.8	{W(12),W(13),W(14),W(15)}, [INPUT :64]!
	vshl.u64	QT67, QW1213, QT01	C Right shift
	vshl.u64	QW1213, QW1213, QT23	C Left shift
	veor	W(12), W(12), DT5
	veor	W(13), W(13), DT6
	vrev64.8	QW1213, QW1213
	vshl.u64	QT45, QW1415, QT01	C Right shift
	vshl.u64	QW1415, QW1415, QT23	C Left shift
	veor	W(14), W(14), DT7
	veor	W(15), W(15), DT4
	vrev64.8	QW1415, QW1415
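	C All 16 schedule words now hold the realigned message block;
	C vrev64.8 has byte-swapped each doubleword from big-endian
	C wire order to host order.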
	vldm	STATE, {SA,SB,SC,SD,SE,SF,SG,SH}

	ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 0)
	ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 1)
	ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 2)
	ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 3)
	ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 4)
	ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 5)
	ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 6)
	ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 7)

	ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 8)
	ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 9)
	ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 10)
	ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 11)
	ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 12)
	ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 13)
	ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 14)
	ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 15)

	add	K, K, #128
	mov	COUNT, #4
.Loop:
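	C Each of the four iterations below expands the schedule in
	C place and runs 16 more rounds, for 16 + 4*16 = 80 rounds in
	C total.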
	EXPN( 0) ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 0)
	EXPN( 1) ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 1)
	EXPN( 2) ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 2)
	EXPN( 3) ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 3)
	EXPN( 4) ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 4)
	EXPN( 5) ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 5)
	EXPN( 6) ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 6)
	EXPN( 7) ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 7)
	EXPN( 8) ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 8)
	EXPN( 9) ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 9)
	EXPN(10) ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 10)
	EXPN(11) ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 11)
	EXPN(12) ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 12)
	EXPN(13) ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 13)
	EXPN(14) ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 14)
	subs	COUNT, COUNT, #1
	EXPN(15) ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 15)
	add	K, K, #128
	bne	.Loop
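	C The subs above is hoisted before the final round pair so its
	C flags are still valid for the bne (add does not set flags).

	C Feed-forward, in C-like pseudocode (a sketch):
	C
	C   for (i = 0; i < 8; i++) state[i] += abcdefgh[i];
	C
	C done two doublewords per q-register add. DW0-DW3 alias
	C QW0001/QW0203; the message schedule is dead here, so those
	C registers are reused for the old state words.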
	vld1.64	{DW0, DW1, DW2, DW3}, [STATE]
	vadd.i64	QSAB, QSAB, QW0001
	vadd.i64	QSCD, QSCD, QW0203
	vst1.64	{SA,SB,SC,SD}, [STATE]!
	vld1.64	{DW0, DW1, DW2, DW3}, [STATE]
	vadd.i64	QSEF, QSEF, QW0001
	vadd.i64	QSGH, QSGH, QW0203
	vst1.64	{SE,SF,SG,SH}, [STATE]!
	vpop	{d8,d9,d10,d11,d12,d13}
	bx	lr
EPILOGUE(_nettle_sha512_compress)