/* sha512-armv7-neon.S - ARM/NEON assembly implementation of SHA-512 transform
 *
 * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include <config.h>
#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
    defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
    defined(HAVE_GCC_INLINE_ASM_NEON)
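
.text

.syntax unified
.fpu neon

/* Register aliases used below.  These assignments are reconstructed from the
 * comments in the code ("Load context to d0-d7", "Load input to w[16],
 * d16-d31") and from the EABI note that d8-d15 (the temporaries RT0-RT7)
 * must be preserved; RK is assumed to be the k[] constant pointer passed in
 * %r2. */
#define RK %r2

#define RA d0
#define RB d1
#define RC d2
#define RD d3
#define RE d4
#define RF d5
#define RG d6
#define RH d7

#define RT0 d8
#define RT1 d9
#define RT2 d10
#define RT3 d11
#define RT4 d12
#define RT5 d13
#define RT6 d14
#define RT7 d15

#define RW0 d16
#define RW1 d17
#define RW2 d18
#define RW3 d19
#define RW4 d20
#define RW5 d21
#define RW6 d22
#define RW7 d23
#define RW8 d24
#define RW9 d25
#define RW10 d26
#define RW11 d27
#define RW12 d28
#define RW13 d29
#define RW14 d30
#define RW15 d31

/* Q-register views of adjacent RW pairs (q8 = d16:d17, ..., q15 = d30:d31). */
#define RW01q q8
#define RW23q q9
#define RW45q q10
#define RW67q q11
#define RW89q q12
#define RW1011q q13
#define RW1213q q14
#define RW1415q q15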

/* structure of SHA512_CONTEXT */
#define hd_a 0
#define hd_b ((hd_a) + 8)
#define hd_c ((hd_b) + 8)
#define hd_d ((hd_c) + 8)
#define hd_e ((hd_d) + 8)
#define hd_f ((hd_e) + 8)
#define hd_g ((hd_f) + 8)
#define hd_h ((hd_g) + 8)
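
/* For reference only: the offsets above assume the hash state is laid out as
 * eight consecutive 64-bit words at the start of SHA512_CONTEXT, roughly as
 * in this C sketch (field and type names are illustrative, not the exact
 * libgcrypt definitions):
 *
 *   typedef unsigned long long u64;
 *
 *   typedef struct
 *   {
 *     u64 h0, h1, h2, h3, h4, h5, h6, h7;  // a..h at byte offsets 0, 8, ..., 56
 *     // ... remaining context fields (buffer, byte counters) follow ...
 *   } SHA512_CONTEXT;
 */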

/***********************************************************************
 * ARM assembly implementation of sha512 transform
 ***********************************************************************/

#define round_0_63(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw14, rw9, rw1) \
	/* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
	vshr.u64 RT1, re, #14; \
	vshl.u64 RT3, re, #64 - 14; \
	vshr.u64 RT4, re, #18; \
	vshl.u64 RT5, re, #64 - 18; \
	veor.64 RT1, RT1, RT3; \
	vld1.64 {RT0}, [RK]!; \
	veor.64 RT1, RT1, RT4; \
	vshr.u64 RT3, re, #41; \
	vshl.u64 RT4, re, #64 - 41; \
	veor.64 RT1, RT1, RT5; \
	vadd.u64 RT0, RT0, rw0; \
	veor.64 RT1, RT1, RT3; \
	vand.64 RT2, re, rf; \
	veor.64 RT1, RT1, RT4; \
	vbic.64 RT6, rg, re; \
	\
	vadd.u64 RT1, RT1, rh; \
	veor.64 RT2, RT2, RT6; \
	vshr.u64 rh, ra, #28; \
	vshl.u64 RT3, ra, #64 - 28; \
	vadd.u64 RT1, RT1, RT0; \
	vshr.u64 RT4, ra, #34; \
	veor.64 rh, rh, RT3; \
	vshl.u64 RT5, ra, #64 - 34; \
	vadd.u64 RT1, RT1, RT2; \
	\
	/* h = Sum0 (a) + Maj (a, b, c); */ \
	veor.64 rh, rh, RT4; \
	vshr.u64 RT3, ra, #39; \
	vshl.u64 RT4, ra, #64 - 39; \
	vorr.64 RT6, ra, rb; \
	vand.64 RT0, ra, rb; \
	veor.64 rh, rh, RT5; \
	vand.64 RT6, RT6, rc; \
	veor.64 rh, rh, RT3; \
	vorr.64 RT0, RT0, RT6; \
	veor.64 rh, rh, RT4; \
	vshr.u64 RT4, rw14, #19; \
	vadd.u64 rh, rh, RT0; \
	vshl.u64 RT2, rw14, #64 - 19; \
	\
	/* w[0] += S1 (w[14]) + w[9] + S0 (w[1]); */ \
	vshr.u64 RT3, rw14, #61; \
	vshl.u64 RT6, rw14, #64 - 61; \
	veor.64 RT0, RT4, RT2; \
	vshr.u64 RT2, rw14, #6; \
	veor.64 RT0, RT0, RT3; \
	vshr.u64 RT7, rw1, #1; \
	veor.64 RT0, RT0, RT6; \
	vshl.u64 RT4, rw1, #64 - 1; \
	veor.64 RT0, RT0, RT2; \
	vshr.u64 RT5, rw1, #8; \
	vadd.u64 rw0, rw0, RT0; \
	vshl.u64 RT6, rw1, #64 - 8; \
	veor.64 RT7, RT7, RT4; \
	vshr.u64 RT4, rw1, #7; \
	veor.64 RT7, RT7, RT5; \
	vadd.u64 rw0, rw0, rw9; /* w[0]+=w[9]; */ \
	veor.64 RT7, RT7, RT6; \
	vadd.u64 rd, rd, RT1; /* d+=t1; */ \
	veor.64 RT7, RT7, RT4; \
	vadd.u64 rh, rh, RT1; /* h+=t1; */ \
	vadd.u64 rw0, rw0, RT7;
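
/* For reference only: a plain C sketch of what one round_0_63() invocation
 * computes: one SHA-512 round plus one message-schedule update, following
 * the standard FIPS 180-4 definitions.  The helper names below are
 * illustrative and do not appear in the assembly.
 *
 *   typedef unsigned long long u64;
 *
 *   static u64 ror64(u64 x, unsigned n) { return (x >> n) | (x << (64 - n)); }
 *
 *   static void round_0_63_ref(u64 *a, u64 *b, u64 *c, u64 *d,
 *                              u64 *e, u64 *f, u64 *g, u64 *h,
 *                              u64 *w0, u64 w14, u64 w9, u64 w1, u64 k)
 *   {
 *     u64 sum1 = ror64(*e, 14) ^ ror64(*e, 18) ^ ror64(*e, 41);
 *     u64 ch   = (*e & *f) ^ (~*e & *g);
 *     u64 t1   = *h + sum1 + ch + k + *w0;
 *     u64 sum0 = ror64(*a, 28) ^ ror64(*a, 34) ^ ror64(*a, 39);
 *     u64 maj  = (*a & *b) | ((*a | *b) & *c);  // same value as Maj(a, b, c)
 *     u64 s1   = ror64(w14, 19) ^ ror64(w14, 61) ^ (w14 >> 6);
 *     u64 s0   = ror64(w1, 1) ^ ror64(w1, 8) ^ (w1 >> 7);
 *
 *     *w0 += s1 + w9 + s0;     // w[t] = w[t-16] + S1(w[t-2]) + w[t-7] + S0(w[t-15])
 *     *d  += t1;               // d += t1
 *     *h   = sum0 + maj + t1;  // h = t1 + Sum0(a) + Maj(a, b, c)
 *   }
 */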

#define round_64_79(ra, rb, rc, rd, re, rf, rg, rh, rw0) \
	/* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
	vld1.64 {RT0}, [RK]!; \
	vshr.u64 RT1, re, #14; \
	vshl.u64 RT3, re, #64 - 14; \
	vshr.u64 RT4, re, #18; \
	vshl.u64 RT5, re, #64 - 18; \
	veor.64 RT1, RT1, RT3; \
	vshr.u64 RT7, ra, #28; \
	veor.64 RT1, RT1, RT4; \
	vshr.u64 RT3, re, #41; \
	vshl.u64 RT4, re, #64 - 41; \
	veor.64 RT1, RT1, RT5; \
	vadd.u64 RT0, RT0, rw0; \
	veor.64 RT1, RT1, RT3; \
	vand.64 RT2, re, rf; \
	veor.64 RT1, RT1, RT4; \
	vbic.64 RT6, rg, re; \
	\
	vadd.u64 RT1, RT1, rh; \
	veor.64 RT2, RT2, RT6; \
	vadd.u64 RT1, RT1, RT0; \
	vshr.u64 RT4, ra, #34; \
	vshl.u64 RT5, ra, #64 - 34; \
	\
	/* t7 = Sum0 (a) + Maj (a, b, c); */ \
	vshl.u64 RT6, ra, #64 - 28; \
	veor.64 RT7, RT7, RT4; \
	vshr.u64 RT3, ra, #39; \
	veor.64 RT7, RT7, RT6; \
	vshl.u64 RT4, ra, #64 - 39; \
	vorr.64 RT6, ra, rb; \
	vand.64 RT0, ra, rb; \
	veor.64 RT7, RT7, RT5; \
	vand.64 RT6, RT6, rc; \
	veor.64 RT7, RT7, RT3; \
	vorr.64 RT0, RT0, RT6; \
	veor.64 RT7, RT7, RT4; \
	vadd.u64 RT1, RT1, RT2; \
	vadd.u64 RT7, RT7, RT0; \
	vadd.u64 rd, rd, RT1; /* d+=t1; */ \
	vadd.u64 rh, RT7, RT1; /* h=t7+t1; */
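
/* For reference only: the code below never moves values between registers at
 * the end of a round.  Instead, each macro invocation permutes its arguments
 * so the register roles rotate (a, b, ..., h) -> (h, a, ..., g), and after
 * eight rounds the roles are back where they started.  Schematically, with a
 * hypothetical ROUND() standing in for round_0_63/round_64_79:
 *
 *   ROUND(a, b, c, d, e, f, g, h, ...);  // round t
 *   ROUND(h, a, b, c, d, e, f, g, ...);  // round t+1
 *   ROUND(g, h, a, b, c, d, e, f, ...);  // round t+2
 */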

.globl _gcry_sha512_transform_armv7_neon
.type _gcry_sha512_transform_armv7_neon,%function;

_gcry_sha512_transform_armv7_neon:
	/* Input:
	 *	%r0: SHA512_CONTEXT
	 *	%r1: data (one 128-byte message block)
	 *	%r2: u64 k[] constants
	 */
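
	/* For reference only: a plausible C-side declaration for this routine
	 * (the exact prototype lives in the C part of the SHA-512 module):
	 *
	 *   void _gcry_sha512_transform_armv7_neon(void *ctx,
	 *                                          const unsigned char *data,
	 *                                          const u64 k[]);
	 */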

	/* Load context to d0-d7 */
	vld1.64 {RA-RD}, [%r0]!;
	vld1.64 {RE-RH}, [%r0];
	sub %r0, %r0, #(4*8); /* rewind %r0 to the start of the context */

	/* Load input to w[16], d16-d31 */
	/* NOTE: Assumes that on ARMv7 unaligned accesses are always allowed. */
	vld1.64 {RW0-RW3}, [%r1]!;
	vld1.64 {RW4-RW7}, [%r1]!;
	vld1.64 {RW8-RW11}, [%r1]!;
	vld1.64 {RW12-RW15}, [%r1];

	vrev64.8 RW01q, RW01q;
	vrev64.8 RW23q, RW23q;
	vrev64.8 RW45q, RW45q;
	vrev64.8 RW67q, RW67q;
	vrev64.8 RW89q, RW89q;
	vrev64.8 RW1011q, RW1011q;
	vrev64.8 RW1213q, RW1213q;
	vrev64.8 RW1415q, RW1415q;
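
	/* For reference only: the vrev64.8 block above byte-swaps each 64-bit
	 * message word, since SHA-512 treats the input as big-endian while this
	 * code runs on a little-endian (__ARMEL__) target.  A C equivalent
	 * would be roughly:
	 *
	 *   for (i = 0; i < 16; i++)
	 *     w[i] = __builtin_bswap64(w[i]);  // GCC/Clang builtin
	 */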

	/* EABI says that d8-d15 must be preserved by callee. */
	vpush {RT0-RT7};

	round_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW14, RW9, RW1);
	round_0_63(RH, RA, RB, RC, RD, RE, RF, RG, RW1, RW15, RW10, RW2);
	round_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW0, RW11, RW3);
	round_0_63(RF, RG, RH, RA, RB, RC, RD, RE, RW3, RW1, RW12, RW4);
	round_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW2, RW13, RW5);
	round_0_63(RD, RE, RF, RG, RH, RA, RB, RC, RW5, RW3, RW14, RW6);
	round_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW4, RW15, RW7);
	round_0_63(RB, RC, RD, RE, RF, RG, RH, RA, RW7, RW5, RW0, RW8);
	round_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW6, RW1, RW9);
	round_0_63(RH, RA, RB, RC, RD, RE, RF, RG, RW9, RW7, RW2, RW10);
	round_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW8, RW3, RW11);
	round_0_63(RF, RG, RH, RA, RB, RC, RD, RE, RW11, RW9, RW4, RW12);
	round_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW10, RW5, RW13);
	round_0_63(RD, RE, RF, RG, RH, RA, RB, RC, RW13, RW11, RW6, RW14);
	round_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW12, RW7, RW15);
	round_0_63(RB, RC, RD, RE, RF, RG, RH, RA, RW15, RW13, RW8, RW0);

	round_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW0);
	round_64_79(RH, RA, RB, RC, RD, RE, RF, RG, RW1);
	round_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW2);
	round_64_79(RF, RG, RH, RA, RB, RC, RD, RE, RW3);
	round_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4);
	round_64_79(RD, RE, RF, RG, RH, RA, RB, RC, RW5);
	round_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6);
	round_64_79(RB, RC, RD, RE, RF, RG, RH, RA, RW7);
	round_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8);
	round_64_79(RH, RA, RB, RC, RD, RE, RF, RG, RW9);
	round_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10);
	round_64_79(RF, RG, RH, RA, RB, RC, RD, RE, RW11);
	round_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12);
	round_64_79(RD, RE, RF, RG, RH, RA, RB, RC, RW13);
	round_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14);
	round_64_79(RB, RC, RD, RE, RF, RG, RH, RA, RW15);

	/* Load context to d16-d23 */
	vld1.64 {RW0-RW3}, [%r0]!;
	vld1.64 {RW4-RW7}, [%r0];
	sub %r0, %r0, #(4*8); /* rewind %r0 to the start of the context */
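
	/* Feed-forward step of SHA-512 (hd[i] += working variable i): the
	 * eight additions below are assumed from the surrounding loads and
	 * stores; RW0-RW7 hold the previous context words just loaded. */
	vadd.u64 RA, RA, RW0;
	vadd.u64 RB, RB, RW1;
	vadd.u64 RC, RC, RW2;
	vadd.u64 RD, RD, RW3;
	vadd.u64 RE, RE, RW4;
	vadd.u64 RF, RF, RW5;
	vadd.u64 RG, RG, RW6;
	vadd.u64 RH, RH, RW7;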

	/* Store the first half of context */
	vst1.64 {RA-RD}, [%r0]!;

	/* Clear used registers */
	veor.u64 RW01q, RW01q;
	veor.u64 RW23q, RW23q;
	veor.u64 RW45q, RW45q;
	veor.u64 RW67q, RW67q;
	vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
	veor.u64 RW89q, RW89q;
	veor.u64 RW1011q, RW1011q;
	veor.u64 RW1213q, RW1213q;
	veor.u64 RW1415q, RW1415q;

	/* Restore the callee-saved NEON registers and return. */
	vpop {RT0-RT7};

	bx %lr;
.size _gcry_sha512_transform_armv7_neon,.-_gcry_sha512_transform_armv7_neon;

#endif