lib/crypto_backend/argon2/blake2/blamka-round-opt.h (cryptsetup, imported upstream version 2.3.3)
/*
 * Argon2 reference source code package - reference C implementations
 *
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
 * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
 * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
 * - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
 *
 * You should have received a copy of both of these licenses along with this
 * software. If not, they may be obtained at the above URLs.
 */

#ifndef BLAKE_ROUND_MKA_OPT_H
#define BLAKE_ROUND_MKA_OPT_H
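
/*
 * SIMD-optimized BlaMka round used by Argon2's compression function: a
 * BLAKE2b round in which 64-bit addition is replaced by the
 * multiplication-hardened operation x + y + 2 * lo32(x) * lo32(y).
 * Several instruction-set levels are provided and selected at compile time.
 */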

#include "blake2-impl.h"

#include <emmintrin.h>
#if defined(__SSSE3__)
#include <tmmintrin.h> /* for _mm_shuffle_epi8 and _mm_alignr_epi8 */
#endif

#if defined(__XOP__) && (defined(__GNUC__) || defined(__clang__))
#include <x86intrin.h>
#endif

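/*
 * Implementation selection: AVX-512F if available, otherwise AVX2, otherwise
 * SSE2/SSSE3 (with XOP's native 64-bit rotate used when the compiler
 * advertises __XOP__).
 */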
#if !defined(__AVX512F__)
#if !defined(__AVX2__)
#if !defined(__XOP__)
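/*
 * Emulate XOP's _mm_roti_epi64(x, c) for negative c, i.e. a right rotation by
 * -c bits: SSSE3 uses byte shuffles for the 16- and 24-bit rotations and a
 * 32-bit lane shuffle for the 32-bit rotation; plain SSE2 falls back to
 * shift-and-xor.  The 63-bit rotation is expressed as srl(x, 63) ^ (x + x).
 */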
#if defined(__SSSE3__)
#define r16 \
    (_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
#define r24 \
    (_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
#define _mm_roti_epi64(x, c) \
    (-(c) == 32) \
        ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
        : (-(c) == 24) \
              ? _mm_shuffle_epi8((x), r24) \
              : (-(c) == 16) \
                    ? _mm_shuffle_epi8((x), r16) \
                    : (-(c) == 63) \
                          ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
                                          _mm_add_epi64((x), (x))) \
                          : _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
                                          _mm_slli_epi64((x), 64 - (-(c))))
#else /* defined(__SSE2__) */
#define _mm_roti_epi64(r, c) \
    _mm_xor_si128(_mm_srli_epi64((r), -(c)), _mm_slli_epi64((r), 64 - (-(c))))
#endif
#else /* __XOP__ provides _mm_roti_epi64 natively */
#endif /* !defined(__XOP__) */

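/*
 * fBlaMka: the BlaMka "multiplication-hardened" addition on two 64-bit lanes,
 * computing x + y + 2 * lo32(x) * lo32(y) per lane (_mm_mul_epu32 multiplies
 * the low 32 bits of each 64-bit lane).  A scalar sketch of one lane, for
 * reference only (not part of this header):
 *
 *     uint64_t fBlaMka_scalar(uint64_t x, uint64_t y) {
 *         uint64_t m = (uint64_t)(uint32_t)x * (uint32_t)y;
 *         return x + y + 2 * m;
 *     }
 */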
static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
    const __m128i z = _mm_mul_epu32(x, y);
    return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z));
}

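/*
 * G1 and G2 are the first and second halves of the BLAKE2b G function with
 * fBlaMka in place of plain addition; together they perform the rotations by
 * 32, 24, 16 and 63 bits.  Each macro operates on two register sets
 * ((A0,B0,C0,D0) and (A1,B1,C1,D1)), each holding two 64-bit lanes, so four
 * columns (or diagonals) are mixed at once.
 */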
#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        A0 = fBlaMka(A0, B0); \
        A1 = fBlaMka(A1, B1); \
\
        D0 = _mm_xor_si128(D0, A0); \
        D1 = _mm_xor_si128(D1, A1); \
\
        D0 = _mm_roti_epi64(D0, -32); \
        D1 = _mm_roti_epi64(D1, -32); \
\
        C0 = fBlaMka(C0, D0); \
        C1 = fBlaMka(C1, D1); \
\
        B0 = _mm_xor_si128(B0, C0); \
        B1 = _mm_xor_si128(B1, C1); \
\
        B0 = _mm_roti_epi64(B0, -24); \
        B1 = _mm_roti_epi64(B1, -24); \
    } while ((void)0, 0)

#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        A0 = fBlaMka(A0, B0); \
        A1 = fBlaMka(A1, B1); \
\
        D0 = _mm_xor_si128(D0, A0); \
        D1 = _mm_xor_si128(D1, A1); \
\
        D0 = _mm_roti_epi64(D0, -16); \
        D1 = _mm_roti_epi64(D1, -16); \
\
        C0 = fBlaMka(C0, D0); \
        C1 = fBlaMka(C1, D1); \
\
        B0 = _mm_xor_si128(B0, C0); \
        B1 = _mm_xor_si128(B1, C1); \
\
        B0 = _mm_roti_epi64(B0, -63); \
        B1 = _mm_roti_epi64(B1, -63); \
    } while ((void)0, 0)

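/*
 * DIAGONALIZE rotates rows B, C and D of the 4x4 state of 64-bit words so
 * that the next G1/G2 pass mixes the diagonals; UNDIAGONALIZE restores the
 * column layout.  Each row is held in two __m128i registers, so the rotation
 * is done with _mm_alignr_epi8 on SSSE3 and with unpack instructions in the
 * SSE2 fallback below.
 */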
#if defined(__SSSE3__)
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        __m128i t0 = _mm_alignr_epi8(B1, B0, 8); \
        __m128i t1 = _mm_alignr_epi8(B0, B1, 8); \
        B0 = t0; \
        B1 = t1; \
\
        t0 = C0; \
        C0 = C1; \
        C1 = t0; \
\
        t0 = _mm_alignr_epi8(D1, D0, 8); \
        t1 = _mm_alignr_epi8(D0, D1, 8); \
        D0 = t1; \
        D1 = t0; \
    } while ((void)0, 0)

#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        __m128i t0 = _mm_alignr_epi8(B0, B1, 8); \
        __m128i t1 = _mm_alignr_epi8(B1, B0, 8); \
        B0 = t0; \
        B1 = t1; \
\
        t0 = C0; \
        C0 = C1; \
        C1 = t0; \
\
        t0 = _mm_alignr_epi8(D0, D1, 8); \
        t1 = _mm_alignr_epi8(D1, D0, 8); \
        D0 = t1; \
        D1 = t0; \
    } while ((void)0, 0)
#else /* SSE2 */
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        __m128i t0 = D0; \
        __m128i t1 = B0; \
        D0 = C0; \
        C0 = C1; \
        C1 = D0; \
        D0 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t0, t0)); \
        D1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(D1, D1)); \
        B0 = _mm_unpackhi_epi64(B0, _mm_unpacklo_epi64(B1, B1)); \
        B1 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(t1, t1)); \
    } while ((void)0, 0)

#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        __m128i t0, t1; \
        t0 = C0; \
        C0 = C1; \
        C1 = t0; \
        t0 = B0; \
        t1 = D0; \
        B0 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(B0, B0)); \
        B1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(B1, B1)); \
        D0 = _mm_unpackhi_epi64(D0, _mm_unpacklo_epi64(D1, D1)); \
        D1 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t1, t1)); \
    } while ((void)0, 0)
#endif /* __SSSE3__ */

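/*
 * One full BlaMka (BLAKE2b-style) round over 16 words held in eight __m128i
 * registers: a column step (G1, G2), diagonalization, a diagonal step
 * (G1, G2), and undiagonalization.
 */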
#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
        DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
\
        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
        UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
    } while ((void)0, 0)
#else /* __AVX2__ */

#include <immintrin.h>

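/*
 * AVX2 variant: each __m256i holds four 64-bit lanes, so a pair of registers
 * carries the same state as four __m128i above.  Rotations by 32, 24 and 16
 * bits are lane/byte shuffles; the 63-bit rotation xors a right shift with
 * x + x (i.e. x << 1).
 */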
#define rotr32(x)   _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
#define rotr24(x)   _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
#define rotr16(x)   _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
#define rotr63(x)   _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))

#define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i ml = _mm256_mul_epu32(A0, B0); \
        ml = _mm256_add_epi64(ml, ml); \
        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
        D0 = _mm256_xor_si256(D0, A0); \
        D0 = rotr32(D0); \
        \
        ml = _mm256_mul_epu32(C0, D0); \
        ml = _mm256_add_epi64(ml, ml); \
        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
        \
        B0 = _mm256_xor_si256(B0, C0); \
        B0 = rotr24(B0); \
        \
        ml = _mm256_mul_epu32(A1, B1); \
        ml = _mm256_add_epi64(ml, ml); \
        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
        D1 = _mm256_xor_si256(D1, A1); \
        D1 = rotr32(D1); \
        \
        ml = _mm256_mul_epu32(C1, D1); \
        ml = _mm256_add_epi64(ml, ml); \
        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
        \
        B1 = _mm256_xor_si256(B1, C1); \
        B1 = rotr24(B1); \
    } while((void)0, 0);

#define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i ml = _mm256_mul_epu32(A0, B0); \
        ml = _mm256_add_epi64(ml, ml); \
        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
        D0 = _mm256_xor_si256(D0, A0); \
        D0 = rotr16(D0); \
        \
        ml = _mm256_mul_epu32(C0, D0); \
        ml = _mm256_add_epi64(ml, ml); \
        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
        B0 = _mm256_xor_si256(B0, C0); \
        B0 = rotr63(B0); \
        \
        ml = _mm256_mul_epu32(A1, B1); \
        ml = _mm256_add_epi64(ml, ml); \
        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
        D1 = _mm256_xor_si256(D1, A1); \
        D1 = rotr16(D1); \
        \
        ml = _mm256_mul_epu32(C1, D1); \
        ml = _mm256_add_epi64(ml, ml); \
        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
        B1 = _mm256_xor_si256(B1, C1); \
        B1 = rotr63(B1); \
    } while((void)0, 0);

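/*
 * Two diagonalization flavours for the two register layouts handled by
 * BLAKE2_ROUND_1 and BLAKE2_ROUND_2 below: the _1 variants permute lanes
 * within each register, while the _2 variants additionally blend between the
 * register pairs because the state rows are interleaved across them.
 */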
#define DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
        \
        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
    } while((void)0, 0);

#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
        B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
        B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
        \
        tmp1 = C0; \
        C0 = C1; \
        C1 = tmp1; \
        \
        tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
        tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
    } while((void)0, 0);

#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
        \
        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
    } while((void)0, 0);

#define UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
        B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
        B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
        \
        tmp1 = C0; \
        C0 = C1; \
        C1 = tmp1; \
        \
        tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
        tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
    } while((void)0, 0);

#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
        \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
    } while((void)0, 0);

#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    } while((void)0, 0);

#endif /* __AVX2__ */

#else /* __AVX512F__ */

#include <immintrin.h>

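/*
 * AVX-512F variant: each __m512i holds eight 64-bit lanes, rotations use the
 * native _mm512_ror_epi64, and muladd() is the same BlaMka multiply-add as
 * fBlaMka above.
 */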
#define ror64(x, n) _mm512_ror_epi64((x), (n))

static __m512i muladd(__m512i x, __m512i y)
{
    __m512i z = _mm512_mul_epu32(x, y);
    return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
}

#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        A0 = muladd(A0, B0); \
        A1 = muladd(A1, B1); \
\
        D0 = _mm512_xor_si512(D0, A0); \
        D1 = _mm512_xor_si512(D1, A1); \
\
        D0 = ror64(D0, 32); \
        D1 = ror64(D1, 32); \
\
        C0 = muladd(C0, D0); \
        C1 = muladd(C1, D1); \
\
        B0 = _mm512_xor_si512(B0, C0); \
        B1 = _mm512_xor_si512(B1, C1); \
\
        B0 = ror64(B0, 24); \
        B1 = ror64(B1, 24); \
    } while ((void)0, 0)

#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        A0 = muladd(A0, B0); \
        A1 = muladd(A1, B1); \
\
        D0 = _mm512_xor_si512(D0, A0); \
        D1 = _mm512_xor_si512(D1, A1); \
\
        D0 = ror64(D0, 16); \
        D1 = ror64(D1, 16); \
\
        C0 = muladd(C0, D0); \
        C1 = muladd(C1, D1); \
\
        B0 = _mm512_xor_si512(B0, C0); \
        B1 = _mm512_xor_si512(B1, C1); \
\
        B0 = ror64(B0, 63); \
        B1 = ror64(B1, 63); \
    } while ((void)0, 0)

#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
        B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
\
        C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
        C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
\
        D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
        D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
    } while ((void)0, 0)

#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
        B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
\
        C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
        C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
\
        D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
        D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
    } while ((void)0, 0)

#define BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
        DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
\
        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
        UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
    } while ((void)0, 0)

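/*
 * SWAP_HALVES regroups the 256-bit halves of a register pair (low halves
 * together, high halves together); SWAP_QUARTERS additionally reorders the
 * 128-bit quarters within each register.  BLAKE2_ROUND_1 and BLAKE2_ROUND_2
 * apply the appropriate swap for the caller's block layout, run BLAKE2_ROUND,
 * and then undo the swap.
 */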
#define SWAP_HALVES(A0, A1) \
    do { \
        __m512i t0, t1; \
        t0 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \
        t1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \
        A0 = t0; \
        A1 = t1; \
    } while((void)0, 0)

#define SWAP_QUARTERS(A0, A1) \
    do { \
        SWAP_HALVES(A0, A1); \
        A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
        A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
    } while((void)0, 0)

#define UNSWAP_QUARTERS(A0, A1) \
    do { \
        A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
        A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
        SWAP_HALVES(A0, A1); \
    } while((void)0, 0)

#define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1) \
    do { \
        SWAP_HALVES(A0, B0); \
        SWAP_HALVES(C0, D0); \
        SWAP_HALVES(A1, B1); \
        SWAP_HALVES(C1, D1); \
        BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \
        SWAP_HALVES(A0, B0); \
        SWAP_HALVES(C0, D0); \
        SWAP_HALVES(A1, B1); \
        SWAP_HALVES(C1, D1); \
    } while ((void)0, 0)

#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        SWAP_QUARTERS(A0, A1); \
        SWAP_QUARTERS(B0, B1); \
        SWAP_QUARTERS(C0, C1); \
        SWAP_QUARTERS(D0, D1); \
        BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \
        UNSWAP_QUARTERS(A0, A1); \
        UNSWAP_QUARTERS(B0, B1); \
        UNSWAP_QUARTERS(C0, C1); \
        UNSWAP_QUARTERS(D0, D1); \
    } while ((void)0, 0)

#endif /* __AVX512F__ */
#endif /* BLAKE_ROUND_MKA_OPT_H */