arch/x86/crypto/aria-aesni-avx-asm_64.S
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 /*
3  * ARIA Cipher 16-way parallel algorithm (AVX)
4  *
5  * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
6  *
7  */
8
9 #include <linux/linkage.h>
10 #include <linux/cfi_types.h>
11 #include <asm/asm-offsets.h>
12 #include <asm/frame.h>
13
14 /* register macros */
15 #define CTX %rdi
16
17
18 #define BV8(a0, a1, a2, a3, a4, a5, a6, a7)             \
19         ( (((a0) & 1) << 0) |                           \
20           (((a1) & 1) << 1) |                           \
21           (((a2) & 1) << 2) |                           \
22           (((a3) & 1) << 3) |                           \
23           (((a4) & 1) << 4) |                           \
24           (((a5) & 1) << 5) |                           \
25           (((a6) & 1) << 6) |                           \
26           (((a7) & 1) << 7) )
27
28 #define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)           \
29         ( ((l7) << (0 * 8)) |                           \
30           ((l6) << (1 * 8)) |                           \
31           ((l5) << (2 * 8)) |                           \
32           ((l4) << (3 * 8)) |                           \
33           ((l3) << (4 * 8)) |                           \
34           ((l2) << (5 * 8)) |                           \
35           ((l1) << (6 * 8)) |                           \
36           ((l0) << (7 * 8)) )
37
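/*
 * Illustrative C model of the two packing helpers above (a reference
 * sketch only, not part of the kernel build): BV8() packs eight bit
 * values into one byte, LSB first, and is used both for matrix rows and
 * for the tf_*_const immediates; BM8X8() packs eight such row bytes into
 * a 64-bit bit-matrix with row l0 in the most significant byte, the
 * layout consumed by the vgf2p8affine* instructions further down.
 *
 *	#include <stdint.h>
 *
 *	static uint8_t bv8(const int b[8])
 *	{
 *		uint8_t v = 0;
 *
 *		for (int i = 0; i < 8; i++)
 *			v |= (uint8_t)((b[i] & 1) << i);
 *		return v;
 *	}
 *
 *	static uint64_t bm8x8(const uint8_t row[8])
 *	{
 *		uint64_t m = 0;
 *
 *		for (int i = 0; i < 8; i++)
 *			m |= (uint64_t)row[i] << ((7 - i) * 8);
 *		return m;
 *	}
 */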
38 #define inc_le128(x, minus_one, tmp)                    \
39         vpcmpeqq minus_one, x, tmp;                     \
40         vpsubq minus_one, x, x;                         \
41         vpslldq $8, tmp, tmp;                           \
42         vpsubq tmp, x, x;
43
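/*
 * inc_le128() increments a 128-bit little-endian counter without a 128-bit
 * add instruction: the vpcmpeqq/vpslldq pair detects a wrap of the low
 * 64-bit half and propagates the carry into the high half.  Equivalent C
 * sketch (illustrative only):
 *
 *	#include <stdint.h>
 *
 *	struct u128_le {
 *		uint64_t lo;
 *		uint64_t hi;
 *	};
 *
 *	static void inc_le128(struct u128_le *x)
 *	{
 *		x->lo++;
 *		if (x->lo == 0)		// low half wrapped
 *			x->hi++;
 *	}
 */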
44 #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)      \
45         vpand x, mask4bit, tmp0;                        \
46         vpandn x, mask4bit, x;                          \
47         vpsrld $4, x, x;                                \
48                                                         \
49         vpshufb tmp0, lo_t, tmp0;                       \
50         vpshufb x, hi_t, x;                             \
51         vpxor tmp0, x, x;
52
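/*
 * filter_8bit() applies an 8-bit -> 8-bit GF(2)-affine transform to every
 * byte of a vector by splitting each byte into nibbles and doing two
 * 16-entry vpshufb table lookups (lo_t on the low nibble, hi_t on the high
 * nibble), then XORing the two results; the split is valid because the
 * transforms used here are XOR-additive across the nibbles.  Per-byte C
 * sketch (illustrative only):
 *
 *	#include <stdint.h>
 *
 *	static uint8_t filter_8bit(uint8_t x,
 *				   const uint8_t lo_t[16],
 *				   const uint8_t hi_t[16])
 *	{
 *		return lo_t[x & 0x0f] ^ hi_t[x >> 4];
 *	}
 */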
53 #define transpose_4x4(x0, x1, x2, x3, t1, t2)           \
54         vpunpckhdq x1, x0, t2;                          \
55         vpunpckldq x1, x0, x0;                          \
56                                                         \
57         vpunpckldq x3, x2, t1;                          \
58         vpunpckhdq x3, x2, x2;                          \
59                                                         \
60         vpunpckhqdq t1, x0, x1;                         \
61         vpunpcklqdq t1, x0, x0;                         \
62                                                         \
63         vpunpckhqdq x2, t2, x3;                         \
64         vpunpcklqdq x2, t2, x2;
65
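/*
 * transpose_4x4() treats x0..x3 as the four rows of a 4x4 matrix of 32-bit
 * words (one row per register) and transposes it in place using the unpack
 * instructions.  Equivalent C sketch (illustrative only):
 *
 *	#include <stdint.h>
 *
 *	static void transpose_4x4(uint32_t m[4][4])
 *	{
 *		for (int i = 0; i < 4; i++)
 *			for (int j = i + 1; j < 4; j++) {
 *				uint32_t t = m[i][j];
 *
 *				m[i][j] = m[j][i];
 *				m[j][i] = t;
 *			}
 *	}
 */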
66 #define byteslice_16x16b(a0, b0, c0, d0,                \
67                          a1, b1, c1, d1,                \
68                          a2, b2, c2, d2,                \
69                          a3, b3, c3, d3,                \
70                          st0, st1)                      \
71         vmovdqu d2, st0;                                \
72         vmovdqu d3, st1;                                \
73         transpose_4x4(a0, a1, a2, a3, d2, d3);          \
74         transpose_4x4(b0, b1, b2, b3, d2, d3);          \
75         vmovdqu st0, d2;                                \
76         vmovdqu st1, d3;                                \
77                                                         \
78         vmovdqu a0, st0;                                \
79         vmovdqu a1, st1;                                \
80         transpose_4x4(c0, c1, c2, c3, a0, a1);          \
81         transpose_4x4(d0, d1, d2, d3, a0, a1);          \
82                                                         \
83         vmovdqu .Lshufb_16x16b(%rip), a0;               \
84         vmovdqu st1, a1;                                \
85         vpshufb a0, a2, a2;                             \
86         vpshufb a0, a3, a3;                             \
87         vpshufb a0, b0, b0;                             \
88         vpshufb a0, b1, b1;                             \
89         vpshufb a0, b2, b2;                             \
90         vpshufb a0, b3, b3;                             \
91         vpshufb a0, a1, a1;                             \
92         vpshufb a0, c0, c0;                             \
93         vpshufb a0, c1, c1;                             \
94         vpshufb a0, c2, c2;                             \
95         vpshufb a0, c3, c3;                             \
96         vpshufb a0, d0, d0;                             \
97         vpshufb a0, d1, d1;                             \
98         vpshufb a0, d2, d2;                             \
99         vpshufb a0, d3, d3;                             \
100         vmovdqu d3, st1;                                \
101         vmovdqu st0, d3;                                \
102         vpshufb a0, d3, a0;                             \
103         vmovdqu d2, st0;                                \
104                                                         \
105         transpose_4x4(a0, b0, c0, d0, d2, d3);          \
106         transpose_4x4(a1, b1, c1, d1, d2, d3);          \
107         vmovdqu st0, d2;                                \
108         vmovdqu st1, d3;                                \
109                                                         \
110         vmovdqu b0, st0;                                \
111         vmovdqu b1, st1;                                \
112         transpose_4x4(a2, b2, c2, d2, b0, b1);          \
113         transpose_4x4(a3, b3, c3, d3, b0, b1);          \
114         vmovdqu st0, b0;                                \
115         vmovdqu st1, b1;                                \
116         /* does not adjust output bytes inside vectors */
117
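/*
 * byteslice_16x16b() converts 16 loaded blocks (one 16-byte block per
 * register) into byte-sliced form, where each output register collects one
 * byte position from all 16 blocks; per the note above, the ordering of
 * those 16 bytes inside each vector is left as-is.  Conceptual C sketch of
 * the data movement (illustrative only):
 *
 *	#include <stdint.h>
 *
 *	static void byteslice_16x16b(uint8_t out[16][16],
 *				     const uint8_t blocks[16][16])
 *	{
 *		for (int byte = 0; byte < 16; byte++)
 *			for (int blk = 0; blk < 16; blk++)
 *				out[byte][blk] = blocks[blk][byte];
 *	}
 */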
118 #define debyteslice_16x16b(a0, b0, c0, d0,              \
119                            a1, b1, c1, d1,              \
120                            a2, b2, c2, d2,              \
121                            a3, b3, c3, d3,              \
122                            st0, st1)                    \
123         vmovdqu d2, st0;                                \
124         vmovdqu d3, st1;                                \
125         transpose_4x4(a0, a1, a2, a3, d2, d3);          \
126         transpose_4x4(b0, b1, b2, b3, d2, d3);          \
127         vmovdqu st0, d2;                                \
128         vmovdqu st1, d3;                                \
129                                                         \
130         vmovdqu a0, st0;                                \
131         vmovdqu a1, st1;                                \
132         transpose_4x4(c0, c1, c2, c3, a0, a1);          \
133         transpose_4x4(d0, d1, d2, d3, a0, a1);          \
134                                                         \
135         vmovdqu .Lshufb_16x16b(%rip), a0;               \
136         vmovdqu st1, a1;                                \
137         vpshufb a0, a2, a2;                             \
138         vpshufb a0, a3, a3;                             \
139         vpshufb a0, b0, b0;                             \
140         vpshufb a0, b1, b1;                             \
141         vpshufb a0, b2, b2;                             \
142         vpshufb a0, b3, b3;                             \
143         vpshufb a0, a1, a1;                             \
144         vpshufb a0, c0, c0;                             \
145         vpshufb a0, c1, c1;                             \
146         vpshufb a0, c2, c2;                             \
147         vpshufb a0, c3, c3;                             \
148         vpshufb a0, d0, d0;                             \
149         vpshufb a0, d1, d1;                             \
150         vpshufb a0, d2, d2;                             \
151         vpshufb a0, d3, d3;                             \
152         vmovdqu d3, st1;                                \
153         vmovdqu st0, d3;                                \
154         vpshufb a0, d3, a0;                             \
155         vmovdqu d2, st0;                                \
156                                                         \
157         transpose_4x4(c0, d0, a0, b0, d2, d3);          \
158         transpose_4x4(c1, d1, a1, b1, d2, d3);          \
159         vmovdqu st0, d2;                                \
160         vmovdqu st1, d3;                                \
161                                                         \
162         vmovdqu b0, st0;                                \
163         vmovdqu b1, st1;                                \
164         transpose_4x4(c2, d2, a2, b2, b0, b1);          \
165         transpose_4x4(c3, d3, a3, b3, b0, b1);          \
166         vmovdqu st0, b0;                                \
167         vmovdqu st1, b1;                                \
168         /* does not adjust output bytes inside vectors */
169
170 /* load blocks to registers and apply pre-whitening */
171 #define inpack16_pre(x0, x1, x2, x3,                    \
172                      x4, x5, x6, x7,                    \
173                      y0, y1, y2, y3,                    \
174                      y4, y5, y6, y7,                    \
175                      rio)                               \
176         vmovdqu (0 * 16)(rio), x0;                      \
177         vmovdqu (1 * 16)(rio), x1;                      \
178         vmovdqu (2 * 16)(rio), x2;                      \
179         vmovdqu (3 * 16)(rio), x3;                      \
180         vmovdqu (4 * 16)(rio), x4;                      \
181         vmovdqu (5 * 16)(rio), x5;                      \
182         vmovdqu (6 * 16)(rio), x6;                      \
183         vmovdqu (7 * 16)(rio), x7;                      \
184         vmovdqu (8 * 16)(rio), y0;                      \
185         vmovdqu (9 * 16)(rio), y1;                      \
186         vmovdqu (10 * 16)(rio), y2;                     \
187         vmovdqu (11 * 16)(rio), y3;                     \
188         vmovdqu (12 * 16)(rio), y4;                     \
189         vmovdqu (13 * 16)(rio), y5;                     \
190         vmovdqu (14 * 16)(rio), y6;                     \
191         vmovdqu (15 * 16)(rio), y7;
192
193 /* byteslice pre-whitened blocks and store to temporary memory */
194 #define inpack16_post(x0, x1, x2, x3,                   \
195                       x4, x5, x6, x7,                   \
196                       y0, y1, y2, y3,                   \
197                       y4, y5, y6, y7,                   \
198                       mem_ab, mem_cd)                   \
199         byteslice_16x16b(x0, x1, x2, x3,                \
200                          x4, x5, x6, x7,                \
201                          y0, y1, y2, y3,                \
202                          y4, y5, y6, y7,                \
203                          (mem_ab), (mem_cd));           \
204                                                         \
205         vmovdqu x0, 0 * 16(mem_ab);                     \
206         vmovdqu x1, 1 * 16(mem_ab);                     \
207         vmovdqu x2, 2 * 16(mem_ab);                     \
208         vmovdqu x3, 3 * 16(mem_ab);                     \
209         vmovdqu x4, 4 * 16(mem_ab);                     \
210         vmovdqu x5, 5 * 16(mem_ab);                     \
211         vmovdqu x6, 6 * 16(mem_ab);                     \
212         vmovdqu x7, 7 * 16(mem_ab);                     \
213         vmovdqu y0, 0 * 16(mem_cd);                     \
214         vmovdqu y1, 1 * 16(mem_cd);                     \
215         vmovdqu y2, 2 * 16(mem_cd);                     \
216         vmovdqu y3, 3 * 16(mem_cd);                     \
217         vmovdqu y4, 4 * 16(mem_cd);                     \
218         vmovdqu y5, 5 * 16(mem_cd);                     \
219         vmovdqu y6, 6 * 16(mem_cd);                     \
220         vmovdqu y7, 7 * 16(mem_cd);
221
222 #define write_output(x0, x1, x2, x3,                    \
223                      x4, x5, x6, x7,                    \
224                      y0, y1, y2, y3,                    \
225                      y4, y5, y6, y7,                    \
226                      mem)                               \
227         vmovdqu x0, 0 * 16(mem);                        \
228         vmovdqu x1, 1 * 16(mem);                        \
229         vmovdqu x2, 2 * 16(mem);                        \
230         vmovdqu x3, 3 * 16(mem);                        \
231         vmovdqu x4, 4 * 16(mem);                        \
232         vmovdqu x5, 5 * 16(mem);                        \
233         vmovdqu x6, 6 * 16(mem);                        \
234         vmovdqu x7, 7 * 16(mem);                        \
235         vmovdqu y0, 8 * 16(mem);                        \
236         vmovdqu y1, 9 * 16(mem);                        \
237         vmovdqu y2, 10 * 16(mem);                       \
238         vmovdqu y3, 11 * 16(mem);                       \
239         vmovdqu y4, 12 * 16(mem);                       \
240         vmovdqu y5, 13 * 16(mem);                       \
241         vmovdqu y6, 14 * 16(mem);                       \
242         vmovdqu y7, 15 * 16(mem);                       \
243
244 #define aria_store_state_8way(x0, x1, x2, x3,           \
245                               x4, x5, x6, x7,           \
246                               mem_tmp, idx)             \
247         vmovdqu x0, ((idx + 0) * 16)(mem_tmp);          \
248         vmovdqu x1, ((idx + 1) * 16)(mem_tmp);          \
249         vmovdqu x2, ((idx + 2) * 16)(mem_tmp);          \
250         vmovdqu x3, ((idx + 3) * 16)(mem_tmp);          \
251         vmovdqu x4, ((idx + 4) * 16)(mem_tmp);          \
252         vmovdqu x5, ((idx + 5) * 16)(mem_tmp);          \
253         vmovdqu x6, ((idx + 6) * 16)(mem_tmp);          \
254         vmovdqu x7, ((idx + 7) * 16)(mem_tmp);
255
256 #define aria_load_state_8way(x0, x1, x2, x3,            \
257                              x4, x5, x6, x7,            \
258                              mem_tmp, idx)              \
259         vmovdqu ((idx + 0) * 16)(mem_tmp), x0;          \
260         vmovdqu ((idx + 1) * 16)(mem_tmp), x1;          \
261         vmovdqu ((idx + 2) * 16)(mem_tmp), x2;          \
262         vmovdqu ((idx + 3) * 16)(mem_tmp), x3;          \
263         vmovdqu ((idx + 4) * 16)(mem_tmp), x4;          \
264         vmovdqu ((idx + 5) * 16)(mem_tmp), x5;          \
265         vmovdqu ((idx + 6) * 16)(mem_tmp), x6;          \
266         vmovdqu ((idx + 7) * 16)(mem_tmp), x7;
267
268 #define aria_ark_8way(x0, x1, x2, x3,                   \
269                       x4, x5, x6, x7,                   \
270                       t0, t1, t2, rk,                   \
271                       idx, round)                       \
272         /* AddRoundKey */                               \
273         vbroadcastss ((round * 16) + idx + 0)(rk), t0;  \
274         vpsrld $24, t0, t2;                             \
275         vpshufb t1, t2, t2;                             \
276         vpxor t2, x0, x0;                               \
277         vpsrld $16, t0, t2;                             \
278         vpshufb t1, t2, t2;                             \
279         vpxor t2, x1, x1;                               \
280         vpsrld $8, t0, t2;                              \
281         vpshufb t1, t2, t2;                             \
282         vpxor t2, x2, x2;                               \
283         vpshufb t1, t0, t2;                             \
284         vpxor t2, x3, x3;                               \
285         vbroadcastss ((round * 16) + idx + 4)(rk), t0;  \
286         vpsrld $24, t0, t2;                             \
287         vpshufb t1, t2, t2;                             \
288         vpxor t2, x4, x4;                               \
289         vpsrld $16, t0, t2;                             \
290         vpshufb t1, t2, t2;                             \
291         vpxor t2, x5, x5;                               \
292         vpsrld $8, t0, t2;                              \
293         vpshufb t1, t2, t2;                             \
294         vpxor t2, x6, x6;                               \
295         vpshufb t1, t0, t2;                             \
296         vpxor t2, x7, x7;
297
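/*
 * aria_ark_8way() is the byte-sliced AddRoundKey for eight byte positions:
 * two 32-bit round-key words are broadcast, each key byte is isolated (t1
 * is expected to hold all-zero, so vpshufb with it replicates byte 0 of t2
 * across the vector) and XORed into the matching state register.  Scalar C
 * sketch for one 32-bit key word and four registers (illustrative only):
 *
 *	#include <stdint.h>
 *
 *	static void ark_word(uint8_t x[4][16], uint32_t rk_word)
 *	{
 *		for (int blk = 0; blk < 16; blk++) {
 *			x[0][blk] ^= (uint8_t)(rk_word >> 24);
 *			x[1][blk] ^= (uint8_t)(rk_word >> 16);
 *			x[2][blk] ^= (uint8_t)(rk_word >> 8);
 *			x[3][blk] ^= (uint8_t)rk_word;
 *		}
 *	}
 */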
298 #ifdef CONFIG_AS_GFNI
299 #define aria_sbox_8way_gfni(x0, x1, x2, x3,             \
300                             x4, x5, x6, x7,             \
301                             t0, t1, t2, t3,             \
302                             t4, t5, t6, t7)             \
303         vmovdqa .Ltf_s2_bitmatrix(%rip), t0;            \
304         vmovdqa .Ltf_inv_bitmatrix(%rip), t1;           \
305         vmovdqa .Ltf_id_bitmatrix(%rip), t2;            \
306         vmovdqa .Ltf_aff_bitmatrix(%rip), t3;           \
307         vmovdqa .Ltf_x2_bitmatrix(%rip), t4;            \
308         vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;   \
309         vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;   \
310         vgf2p8affineqb $(tf_inv_const), t1, x2, x2;     \
311         vgf2p8affineqb $(tf_inv_const), t1, x6, x6;     \
312         vgf2p8affineinvqb $0, t2, x2, x2;               \
313         vgf2p8affineinvqb $0, t2, x6, x6;               \
314         vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;  \
315         vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;  \
316         vgf2p8affineqb $(tf_x2_const), t4, x3, x3;      \
317         vgf2p8affineqb $(tf_x2_const), t4, x7, x7;      \
318         vgf2p8affineinvqb $0, t2, x3, x3;               \
319         vgf2p8affineinvqb $0, t2, x7, x7
320
321 #endif /* CONFIG_AS_GFNI */
322
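/*
 * aria_sbox_8way_gfni() above computes the four ARIA S-box variants
 * directly as 8x8 GF(2) affine transforms: vgf2p8affineqb applies
 * y = A*x ^ c per byte, and vgf2p8affineinvqb first inverts the byte in
 * GF(2^8) and then applies the affine transform.  The AES-NI based
 * aria_sbox_8way() below builds the same S-boxes from AESENCLAST and
 * AESDECLAST plus the filter_8bit() affine fix-ups.  Generic C sketch of
 * one 8x8 affine step (illustrative only; the exact bit/byte packing of
 * the matrix follows BV8()/BM8X8() and the instruction definition):
 *
 *	#include <stdint.h>
 *
 *	static uint8_t gf2_affine(uint8_t x, const uint8_t row[8], uint8_t c)
 *	{
 *		uint8_t y = 0;
 *
 *		for (int i = 0; i < 8; i++) {
 *			uint8_t acc = row[i] & x;
 *
 *			acc ^= acc >> 4;
 *			acc ^= acc >> 2;
 *			acc ^= acc >> 1;	// parity of row[i] & x
 *			y |= (uint8_t)((acc & 1) << i);
 *		}
 *		return y ^ c;
 *	}
 */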
323 #define aria_sbox_8way(x0, x1, x2, x3,                  \
324                        x4, x5, x6, x7,                  \
325                        t0, t1, t2, t3,                  \
326                        t4, t5, t6, t7)                  \
327         vmovdqa .Linv_shift_row(%rip), t0;              \
328         vmovdqa .Lshift_row(%rip), t1;                  \
329         vbroadcastss .L0f0f0f0f(%rip), t6;              \
330         vmovdqa .Ltf_lo__inv_aff__and__s2(%rip), t2;    \
331         vmovdqa .Ltf_hi__inv_aff__and__s2(%rip), t3;    \
332         vmovdqa .Ltf_lo__x2__and__fwd_aff(%rip), t4;    \
333         vmovdqa .Ltf_hi__x2__and__fwd_aff(%rip), t5;    \
334                                                         \
335         vaesenclast t7, x0, x0;                         \
336         vaesenclast t7, x4, x4;                         \
337         vaesenclast t7, x1, x1;                         \
338         vaesenclast t7, x5, x5;                         \
339         vaesdeclast t7, x2, x2;                         \
340         vaesdeclast t7, x6, x6;                         \
341                                                         \
342         /* AES inverse shift rows */                    \
343         vpshufb t0, x0, x0;                             \
344         vpshufb t0, x4, x4;                             \
345         vpshufb t0, x1, x1;                             \
346         vpshufb t0, x5, x5;                             \
347         vpshufb t1, x3, x3;                             \
348         vpshufb t1, x7, x7;                             \
349         vpshufb t1, x2, x2;                             \
350         vpshufb t1, x6, x6;                             \
351                                                         \
352         /* affine transformation for S2 */              \
353         filter_8bit(x1, t2, t3, t6, t0);                \
354         /* affine transformation for S2 */              \
355         filter_8bit(x5, t2, t3, t6, t0);                \
356                                                         \
357         /* affine transformation for X2 */              \
358         filter_8bit(x3, t4, t5, t6, t0);                \
359         /* affine transformation for X2 */              \
360         filter_8bit(x7, t4, t5, t6, t0);                \
361         vaesdeclast t7, x3, x3;                         \
362         vaesdeclast t7, x7, x7;
363
364 #define aria_diff_m(x0, x1, x2, x3,                     \
365                     t0, t1, t2, t3)                     \
366         /* T = rotr32(X, 8); */                         \
367         /* X ^= T */                                    \
368         vpxor x0, x3, t0;                               \
369         vpxor x1, x0, t1;                               \
370         vpxor x2, x1, t2;                               \
371         vpxor x3, x2, t3;                               \
372         /* X = T ^ rotr(X, 16); */                      \
373         vpxor t2, x0, x0;                               \
374         vpxor x1, t3, t3;                               \
375         vpxor t0, x2, x2;                               \
376         vpxor t1, x3, x1;                               \
377         vmovdqu t3, x3;
378
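/*
 * aria_diff_m() is the byte-sliced form of the per-word mixing step spelled
 * out in the comments above.  Scalar C sketch on one 32-bit word
 * (illustrative only):
 *
 *	#include <stdint.h>
 *
 *	static uint32_t rotr32(uint32_t v, unsigned int n)
 *	{
 *		return (v >> n) | (v << (32 - n));
 *	}
 *
 *	static uint32_t aria_diff_m(uint32_t x)
 *	{
 *		uint32_t t = rotr32(x, 8);
 *
 *		x ^= t;
 *		return t ^ rotr32(x, 16);
 *	}
 */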
379 #define aria_diff_word(x0, x1, x2, x3,                  \
380                        x4, x5, x6, x7,                  \
381                        y0, y1, y2, y3,                  \
382                        y4, y5, y6, y7)                  \
383         /* t1 ^= t2; */                                 \
384         vpxor y0, x4, x4;                               \
385         vpxor y1, x5, x5;                               \
386         vpxor y2, x6, x6;                               \
387         vpxor y3, x7, x7;                               \
388                                                         \
389         /* t2 ^= t3; */                                 \
390         vpxor y4, y0, y0;                               \
391         vpxor y5, y1, y1;                               \
392         vpxor y6, y2, y2;                               \
393         vpxor y7, y3, y3;                               \
394                                                         \
395         /* t0 ^= t1; */                                 \
396         vpxor x4, x0, x0;                               \
397         vpxor x5, x1, x1;                               \
398         vpxor x6, x2, x2;                               \
399         vpxor x7, x3, x3;                               \
400                                                         \
401         /* t3 ^= t1; */                                 \
402         vpxor x4, y4, y4;                               \
403         vpxor x5, y5, y5;                               \
404         vpxor x6, y6, y6;                               \
405         vpxor x7, y7, y7;                               \
406                                                         \
407         /* t2 ^= t0; */                                 \
408         vpxor x0, y0, y0;                               \
409         vpxor x1, y1, y1;                               \
410         vpxor x2, y2, y2;                               \
411         vpxor x3, y3, y3;                               \
412                                                         \
413         /* t1 ^= t2; */                                 \
414         vpxor y0, x4, x4;                               \
415         vpxor y1, x5, x5;                               \
416         vpxor y2, x6, x6;                               \
417         vpxor y3, x7, x7;
418
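/*
 * aria_diff_word() XORs the four word groups t0..t3 (each held as four
 * byte-sliced registers) together exactly in the order given by the
 * comments above.  Scalar C sketch (illustrative only):
 *
 *	#include <stdint.h>
 *
 *	static void aria_diff_word(uint32_t *t0, uint32_t *t1,
 *				   uint32_t *t2, uint32_t *t3)
 *	{
 *		*t1 ^= *t2;
 *		*t2 ^= *t3;
 *		*t0 ^= *t1;
 *		*t3 ^= *t1;
 *		*t2 ^= *t0;
 *		*t1 ^= *t2;
 *	}
 */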
419 #define aria_fe(x0, x1, x2, x3,                         \
420                 x4, x5, x6, x7,                         \
421                 y0, y1, y2, y3,                         \
422                 y4, y5, y6, y7,                         \
423                 mem_tmp, rk, round)                     \
424         vpxor y7, y7, y7;                               \
425         aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
426                       y0, y7, y2, rk, 8, round);        \
427                                                         \
428         aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,  \
429                        y0, y1, y2, y3, y4, y5, y6, y7); \
430                                                         \
431         aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
432         aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
433         aria_store_state_8way(x0, x1, x2, x3,           \
434                               x4, x5, x6, x7,           \
435                               mem_tmp, 8);              \
436                                                         \
437         aria_load_state_8way(x0, x1, x2, x3,            \
438                              x4, x5, x6, x7,            \
439                              mem_tmp, 0);               \
440         aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
441                       y0, y7, y2, rk, 0, round);        \
442                                                         \
443         aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,  \
444                        y0, y1, y2, y3, y4, y5, y6, y7); \
445                                                         \
446         aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
447         aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
448         aria_store_state_8way(x0, x1, x2, x3,           \
449                               x4, x5, x6, x7,           \
450                               mem_tmp, 0);              \
451         aria_load_state_8way(y0, y1, y2, y3,            \
452                              y4, y5, y6, y7,            \
453                              mem_tmp, 8);               \
454         aria_diff_word(x0, x1, x2, x3,                  \
455                        x4, x5, x6, x7,                  \
456                        y0, y1, y2, y3,                  \
457                        y4, y5, y6, y7);                 \
458         /* aria_diff_byte()                             \
459          * T3 = ABCD -> BADC                            \
460          * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6        \
461          * T0 = ABCD -> CDAB                            \
462          * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1        \
463          * T1 = ABCD -> DCBA                            \
464          * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4        \
465          */                                             \
466         aria_diff_word(x2, x3, x0, x1,                  \
467                        x7, x6, x5, x4,                  \
468                        y0, y1, y2, y3,                  \
469                        y5, y4, y7, y6);                 \
470         aria_store_state_8way(x3, x2, x1, x0,           \
471                               x6, x7, x4, x5,           \
472                               mem_tmp, 0);
473
474 #define aria_fo(x0, x1, x2, x3,                         \
475                 x4, x5, x6, x7,                         \
476                 y0, y1, y2, y3,                         \
477                 y4, y5, y6, y7,                         \
478                 mem_tmp, rk, round)                     \
479         vpxor y7, y7, y7;                               \
480         aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
481                       y0, y7, y2, rk, 8, round);        \
482                                                         \
483         aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,  \
484                        y0, y1, y2, y3, y4, y5, y6, y7); \
485                                                         \
486         aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
487         aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
488         aria_store_state_8way(x0, x1, x2, x3,           \
489                               x4, x5, x6, x7,           \
490                               mem_tmp, 8);              \
491                                                         \
492         aria_load_state_8way(x0, x1, x2, x3,            \
493                              x4, x5, x6, x7,            \
494                              mem_tmp, 0);               \
495         aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
496                       y0, y7, y2, rk, 0, round);        \
497                                                         \
498         aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,  \
499                        y0, y1, y2, y3, y4, y5, y6, y7); \
500                                                         \
501         aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
502         aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
503         aria_store_state_8way(x0, x1, x2, x3,           \
504                               x4, x5, x6, x7,           \
505                               mem_tmp, 0);              \
506         aria_load_state_8way(y0, y1, y2, y3,            \
507                              y4, y5, y6, y7,            \
508                              mem_tmp, 8);               \
509         aria_diff_word(x0, x1, x2, x3,                  \
510                        x4, x5, x6, x7,                  \
511                        y0, y1, y2, y3,                  \
512                        y4, y5, y6, y7);                 \
513         /* aria_diff_byte()                             \
514          * T1 = ABCD -> BADC                            \
515          * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6        \
516          * T2 = ABCD -> CDAB                            \
517          * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1       \
518          * T3 = ABCD -> DCBA                            \
519          * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4        \
520          */                                             \
521         aria_diff_word(x0, x1, x2, x3,                  \
522                        x5, x4, x7, x6,                  \
523                        y2, y3, y0, y1,                  \
524                        y7, y6, y5, y4);                 \
525         aria_store_state_8way(x3, x2, x1, x0,           \
526                               x6, x7, x4, x5,           \
527                               mem_tmp, 0);
528
529 #define aria_ff(x0, x1, x2, x3,                         \
530                 x4, x5, x6, x7,                         \
531                 y0, y1, y2, y3,                         \
532                 y4, y5, y6, y7,                         \
533                 mem_tmp, rk, round, last_round)         \
534         vpxor y7, y7, y7;                               \
535         aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
536                       y0, y7, y2, rk, 8, round);        \
537                                                         \
538         aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,  \
539                        y0, y1, y2, y3, y4, y5, y6, y7); \
540                                                         \
541         aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
542                       y0, y7, y2, rk, 8, last_round);   \
543                                                         \
544         aria_store_state_8way(x0, x1, x2, x3,           \
545                               x4, x5, x6, x7,           \
546                               mem_tmp, 8);              \
547                                                         \
548         aria_load_state_8way(x0, x1, x2, x3,            \
549                              x4, x5, x6, x7,            \
550                              mem_tmp, 0);               \
551         aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
552                       y0, y7, y2, rk, 0, round);        \
553                                                         \
554         aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,  \
555                        y0, y1, y2, y3, y4, y5, y6, y7); \
556                                                         \
557         aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
558                       y0, y7, y2, rk, 0, last_round);   \
559                                                         \
560         aria_load_state_8way(y0, y1, y2, y3,            \
561                              y4, y5, y6, y7,            \
562                              mem_tmp, 8);
563
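/*
 * Summary of the three round macros above (the GFNI variants below follow
 * the same structure): aria_fo() and aria_fe() each run AddRoundKey, the
 * substitution layer (aria_sbox_8way() with its two S-box orderings) and
 * the diffusion steps (aria_diff_m(), aria_diff_word() and the register
 * permutation documented as aria_diff_byte()) on both 8-register halves of
 * the byte-sliced state; aria_ff() is the final round, which replaces the
 * diffusion layer with a second AddRoundKey using last_round.
 */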
564 #ifdef CONFIG_AS_GFNI
565 #define aria_fe_gfni(x0, x1, x2, x3,                    \
566                      x4, x5, x6, x7,                    \
567                      y0, y1, y2, y3,                    \
568                      y4, y5, y6, y7,                    \
569                      mem_tmp, rk, round)                \
570         vpxor y7, y7, y7;                               \
571         aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
572                       y0, y7, y2, rk, 8, round);        \
573                                                         \
574         aria_sbox_8way_gfni(x2, x3, x0, x1,             \
575                             x6, x7, x4, x5,             \
576                             y0, y1, y2, y3,             \
577                             y4, y5, y6, y7);            \
578                                                         \
579         aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
580         aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
581         aria_store_state_8way(x0, x1, x2, x3,           \
582                               x4, x5, x6, x7,           \
583                               mem_tmp, 8);              \
584                                                         \
585         aria_load_state_8way(x0, x1, x2, x3,            \
586                              x4, x5, x6, x7,            \
587                              mem_tmp, 0);               \
588         aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
589                       y0, y7, y2, rk, 0, round);        \
590                                                         \
591         aria_sbox_8way_gfni(x2, x3, x0, x1,             \
592                             x6, x7, x4, x5,             \
593                             y0, y1, y2, y3,             \
594                             y4, y5, y6, y7);            \
595                                                         \
596         aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
597         aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
598         aria_store_state_8way(x0, x1, x2, x3,           \
599                               x4, x5, x6, x7,           \
600                               mem_tmp, 0);              \
601         aria_load_state_8way(y0, y1, y2, y3,            \
602                              y4, y5, y6, y7,            \
603                              mem_tmp, 8);               \
604         aria_diff_word(x0, x1, x2, x3,                  \
605                        x4, x5, x6, x7,                  \
606                        y0, y1, y2, y3,                  \
607                        y4, y5, y6, y7);                 \
608         /* aria_diff_byte()                             \
609          * T3 = ABCD -> BADC                            \
610          * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6        \
611          * T0 = ABCD -> CDAB                            \
612          * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1        \
613          * T1 = ABCD -> DCBA                            \
614          * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4        \
615          */                                             \
616         aria_diff_word(x2, x3, x0, x1,                  \
617                        x7, x6, x5, x4,                  \
618                        y0, y1, y2, y3,                  \
619                        y5, y4, y7, y6);                 \
620         aria_store_state_8way(x3, x2, x1, x0,           \
621                               x6, x7, x4, x5,           \
622                               mem_tmp, 0);
623
624 #define aria_fo_gfni(x0, x1, x2, x3,                    \
625                      x4, x5, x6, x7,                    \
626                      y0, y1, y2, y3,                    \
627                      y4, y5, y6, y7,                    \
628                      mem_tmp, rk, round)                \
629         vpxor y7, y7, y7;                               \
630         aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
631                       y0, y7, y2, rk, 8, round);        \
632                                                         \
633         aria_sbox_8way_gfni(x0, x1, x2, x3,             \
634                             x4, x5, x6, x7,             \
635                             y0, y1, y2, y3,             \
636                             y4, y5, y6, y7);            \
637                                                         \
638         aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
639         aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
640         aria_store_state_8way(x0, x1, x2, x3,           \
641                               x4, x5, x6, x7,           \
642                               mem_tmp, 8);              \
643                                                         \
644         aria_load_state_8way(x0, x1, x2, x3,            \
645                              x4, x5, x6, x7,            \
646                              mem_tmp, 0);               \
647         aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
648                       y0, y7, y2, rk, 0, round);        \
649                                                         \
650         aria_sbox_8way_gfni(x0, x1, x2, x3,             \
651                             x4, x5, x6, x7,             \
652                             y0, y1, y2, y3,             \
653                             y4, y5, y6, y7);            \
654                                                         \
655         aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
656         aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
657         aria_store_state_8way(x0, x1, x2, x3,           \
658                               x4, x5, x6, x7,           \
659                               mem_tmp, 0);              \
660         aria_load_state_8way(y0, y1, y2, y3,            \
661                              y4, y5, y6, y7,            \
662                              mem_tmp, 8);               \
663         aria_diff_word(x0, x1, x2, x3,                  \
664                        x4, x5, x6, x7,                  \
665                        y0, y1, y2, y3,                  \
666                        y4, y5, y6, y7);                 \
667         /* aria_diff_byte()                             \
668          * T1 = ABCD -> BADC                            \
669          * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6        \
670          * T2 = ABCD -> CDAB                            \
671          * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1       \
672          * T3 = ABCD -> DCBA                            \
673          * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4        \
674          */                                             \
675         aria_diff_word(x0, x1, x2, x3,                  \
676                        x5, x4, x7, x6,                  \
677                        y2, y3, y0, y1,                  \
678                        y7, y6, y5, y4);                 \
679         aria_store_state_8way(x3, x2, x1, x0,           \
680                               x6, x7, x4, x5,           \
681                               mem_tmp, 0);
682
683 #define aria_ff_gfni(x0, x1, x2, x3,                    \
684                 x4, x5, x6, x7,                         \
685                 y0, y1, y2, y3,                         \
686                 y4, y5, y6, y7,                         \
687                 mem_tmp, rk, round, last_round)         \
688         vpxor y7, y7, y7;                               \
689         aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
690                       y0, y7, y2, rk, 8, round);        \
691                                                         \
692         aria_sbox_8way_gfni(x2, x3, x0, x1,             \
693                             x6, x7, x4, x5,             \
694                             y0, y1, y2, y3,             \
695                             y4, y5, y6, y7);            \
696                                                         \
697         aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
698                       y0, y7, y2, rk, 8, last_round);   \
699                                                         \
700         aria_store_state_8way(x0, x1, x2, x3,           \
701                               x4, x5, x6, x7,           \
702                               mem_tmp, 8);              \
703                                                         \
704         aria_load_state_8way(x0, x1, x2, x3,            \
705                              x4, x5, x6, x7,            \
706                              mem_tmp, 0);               \
707         aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
708                       y0, y7, y2, rk, 0, round);        \
709                                                         \
710         aria_sbox_8way_gfni(x2, x3, x0, x1,             \
711                             x6, x7, x4, x5,             \
712                             y0, y1, y2, y3,             \
713                             y4, y5, y6, y7);            \
714                                                         \
715         aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
716                       y0, y7, y2, rk, 0, last_round);   \
717                                                         \
718         aria_load_state_8way(y0, y1, y2, y3,            \
719                              y4, y5, y6, y7,            \
720                              mem_tmp, 8);
721
722 #endif /* CONFIG_AS_GFNI */
723
724 /* NB: section is mergeable, all elements must be aligned 16-byte blocks */
725 .section        .rodata.cst16, "aM", @progbits, 16
726 .align 16
727
728 #define SHUFB_BYTES(idx) \
729         0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
730
731 .Lshufb_16x16b:
732         .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
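/* Expanded, the pattern above is
 *   0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
 * i.e. a 4x4 transpose of the byte positions within each 16-byte vector,
 * as used by the (de)byteslice_16x16b() macros. */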
733 /* For isolating SubBytes from AESENCLAST, inverse shift row */
734 .Linv_shift_row:
735         .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
736         .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
737 .Lshift_row:
738         .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
739         .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
740 /* For CTR-mode IV byteswap */
741 .Lbswap128_mask:
742         .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
743         .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
744
745 /* AES inverse affine and S2 combined:
746  *      1 1 0 0 0 0 0 1     x0     0
747  *      0 1 0 0 1 0 0 0     x1     0
748  *      1 1 0 0 1 1 1 1     x2     0
749  *      0 1 1 0 1 0 0 1     x3     1
750  *      0 1 0 0 1 1 0 0  *  x4  +  0
751  *      0 1 0 1 1 0 0 0     x5     0
752  *      0 0 0 0 0 1 0 1     x6     0
753  *      1 1 1 0 0 1 1 1     x7     1
754  */
755 .Ltf_lo__inv_aff__and__s2:
756         .octa 0x92172DA81A9FA520B2370D883ABF8500
757 .Ltf_hi__inv_aff__and__s2:
758         .octa 0x2B15FFC1AF917B45E6D8320C625CB688
759
760 /* X2 and AES forward affine combined:
761  *      1 0 1 1 0 0 0 1     x0     0
762  *      0 1 1 1 1 0 1 1     x1     0
763  *      0 0 0 1 1 0 1 0     x2     1
764  *      0 1 0 0 0 1 0 0     x3     0
765  *      0 0 1 1 1 0 1 1  *  x4  +  0
766  *      0 1 0 0 1 0 0 0     x5     0
767  *      1 1 0 1 0 0 1 1     x6     0
768  *      0 1 0 0 1 0 1 0     x7     0
769  */
770 .Ltf_lo__x2__and__fwd_aff:
771         .octa 0xEFAE0544FCBD1657B8F95213ABEA4100
772 .Ltf_hi__x2__and__fwd_aff:
773         .octa 0x3F893781E95FE1576CDA64D2BA0CB204
774
775 #ifdef CONFIG_AS_GFNI
776 .section        .rodata.cst8, "aM", @progbits, 8
777 .align 8
778 /* AES affine: */
779 #define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
780 .Ltf_aff_bitmatrix:
781         .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
782                     BV8(1, 1, 0, 0, 0, 1, 1, 1),
783                     BV8(1, 1, 1, 0, 0, 0, 1, 1),
784                     BV8(1, 1, 1, 1, 0, 0, 0, 1),
785                     BV8(1, 1, 1, 1, 1, 0, 0, 0),
786                     BV8(0, 1, 1, 1, 1, 1, 0, 0),
787                     BV8(0, 0, 1, 1, 1, 1, 1, 0),
788                     BV8(0, 0, 0, 1, 1, 1, 1, 1))
789         .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
790                     BV8(1, 1, 0, 0, 0, 1, 1, 1),
791                     BV8(1, 1, 1, 0, 0, 0, 1, 1),
792                     BV8(1, 1, 1, 1, 0, 0, 0, 1),
793                     BV8(1, 1, 1, 1, 1, 0, 0, 0),
794                     BV8(0, 1, 1, 1, 1, 1, 0, 0),
795                     BV8(0, 0, 1, 1, 1, 1, 1, 0),
796                     BV8(0, 0, 0, 1, 1, 1, 1, 1))
797
798 /* AES inverse affine: */
799 #define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
800 .Ltf_inv_bitmatrix:
801         .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
802                     BV8(1, 0, 0, 1, 0, 0, 1, 0),
803                     BV8(0, 1, 0, 0, 1, 0, 0, 1),
804                     BV8(1, 0, 1, 0, 0, 1, 0, 0),
805                     BV8(0, 1, 0, 1, 0, 0, 1, 0),
806                     BV8(0, 0, 1, 0, 1, 0, 0, 1),
807                     BV8(1, 0, 0, 1, 0, 1, 0, 0),
808                     BV8(0, 1, 0, 0, 1, 0, 1, 0))
809         .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
810                     BV8(1, 0, 0, 1, 0, 0, 1, 0),
811                     BV8(0, 1, 0, 0, 1, 0, 0, 1),
812                     BV8(1, 0, 1, 0, 0, 1, 0, 0),
813                     BV8(0, 1, 0, 1, 0, 0, 1, 0),
814                     BV8(0, 0, 1, 0, 1, 0, 0, 1),
815                     BV8(1, 0, 0, 1, 0, 1, 0, 0),
816                     BV8(0, 1, 0, 0, 1, 0, 1, 0))
817
818 /* S2: */
819 #define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
820 .Ltf_s2_bitmatrix:
821         .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
822                     BV8(0, 0, 1, 1, 1, 1, 1, 1),
823                     BV8(1, 1, 1, 0, 1, 1, 0, 1),
824                     BV8(1, 1, 0, 0, 0, 0, 1, 1),
825                     BV8(0, 1, 0, 0, 0, 0, 1, 1),
826                     BV8(1, 1, 0, 0, 1, 1, 1, 0),
827                     BV8(0, 1, 1, 0, 0, 0, 1, 1),
828                     BV8(1, 1, 1, 1, 0, 1, 1, 0))
829         .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
830                     BV8(0, 0, 1, 1, 1, 1, 1, 1),
831                     BV8(1, 1, 1, 0, 1, 1, 0, 1),
832                     BV8(1, 1, 0, 0, 0, 0, 1, 1),
833                     BV8(0, 1, 0, 0, 0, 0, 1, 1),
834                     BV8(1, 1, 0, 0, 1, 1, 1, 0),
835                     BV8(0, 1, 1, 0, 0, 0, 1, 1),
836                     BV8(1, 1, 1, 1, 0, 1, 1, 0))
837
838 /* X2: */
839 #define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
840 .Ltf_x2_bitmatrix:
841         .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
842                     BV8(0, 0, 1, 0, 0, 1, 1, 0),
843                     BV8(0, 0, 0, 0, 1, 0, 1, 0),
844                     BV8(1, 1, 1, 0, 0, 0, 1, 1),
845                     BV8(1, 1, 1, 0, 1, 1, 0, 0),
846                     BV8(0, 1, 1, 0, 1, 0, 1, 1),
847                     BV8(1, 0, 1, 1, 1, 1, 0, 1),
848                     BV8(1, 0, 0, 1, 0, 0, 1, 1))
849         .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
850                     BV8(0, 0, 1, 0, 0, 1, 1, 0),
851                     BV8(0, 0, 0, 0, 1, 0, 1, 0),
852                     BV8(1, 1, 1, 0, 0, 0, 1, 1),
853                     BV8(1, 1, 1, 0, 1, 1, 0, 0),
854                     BV8(0, 1, 1, 0, 1, 0, 1, 1),
855                     BV8(1, 0, 1, 1, 1, 1, 0, 1),
856                     BV8(1, 0, 0, 1, 0, 0, 1, 1))
857
858 /* Identity matrix: */
859 .Ltf_id_bitmatrix:
860         .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
861                     BV8(0, 1, 0, 0, 0, 0, 0, 0),
862                     BV8(0, 0, 1, 0, 0, 0, 0, 0),
863                     BV8(0, 0, 0, 1, 0, 0, 0, 0),
864                     BV8(0, 0, 0, 0, 1, 0, 0, 0),
865                     BV8(0, 0, 0, 0, 0, 1, 0, 0),
866                     BV8(0, 0, 0, 0, 0, 0, 1, 0),
867                     BV8(0, 0, 0, 0, 0, 0, 0, 1))
868         .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
869                     BV8(0, 1, 0, 0, 0, 0, 0, 0),
870                     BV8(0, 0, 1, 0, 0, 0, 0, 0),
871                     BV8(0, 0, 0, 1, 0, 0, 0, 0),
872                     BV8(0, 0, 0, 0, 1, 0, 0, 0),
873                     BV8(0, 0, 0, 0, 0, 1, 0, 0),
874                     BV8(0, 0, 0, 0, 0, 0, 1, 0),
875                     BV8(0, 0, 0, 0, 0, 0, 0, 1))
876 #endif /* CONFIG_AS_GFNI */
877
878 /* 4-bit mask */
879 .section        .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
880 .align 4
881 .L0f0f0f0f:
882         .long 0x0f0f0f0f
883
884 .text
885
886 SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
887         /* input:
888         *      %r9: rk
889         *      %rsi: dst
890         *      %rdx: src
891         *      %xmm0..%xmm15: 16 byte-sliced blocks
892         */
893
894         FRAME_BEGIN
895
896         movq %rsi, %rax;
897         leaq 8 * 16(%rax), %r8;
898
899         inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
900                       %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
901                       %xmm15, %rax, %r8);
902         aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
903                 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
904                 %rax, %r9, 0);
905         aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
906                 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
907                 %xmm15, %rax, %r9, 1);
908         aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
909                 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
910                 %rax, %r9, 2);
911         aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
912                 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
913                 %xmm15, %rax, %r9, 3);
914         aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
915                 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
916                 %rax, %r9, 4);
917         aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
918                 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
919                 %xmm15, %rax, %r9, 5);
920         aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
921                 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
922                 %rax, %r9, 6);
923         aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
924                 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
925                 %xmm15, %rax, %r9, 7);
926         aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
927                 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
928                 %rax, %r9, 8);
929         aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
930                 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
931                 %xmm15, %rax, %r9, 9);
932         aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
933                 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
934                 %rax, %r9, 10);
935         cmpl $12, ARIA_CTX_rounds(CTX);
936         jne .Laria_192;
937         aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
938                 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
939                 %xmm15, %rax, %r9, 11, 12);
940         jmp .Laria_end;
941 .Laria_192:
942         aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
943                 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
944                 %xmm15, %rax, %r9, 11);
945         aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
946                 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
947                 %rax, %r9, 12);
948         cmpl $14, ARIA_CTX_rounds(CTX);
949         jne .Laria_256;
950         aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
951                 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
952                 %xmm15, %rax, %r9, 13, 14);
953         jmp .Laria_end;
954 .Laria_256:
955         aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
956                 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
957                 %xmm15, %rax, %r9, 13);
958         aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
959                 %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
960                 %rax, %r9, 14);
961         aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
962                 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
963                 %xmm15, %rax, %r9, 15, 16);
964 .Laria_end:
965         debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
966                            %xmm9, %xmm13, %xmm0, %xmm5,
967                            %xmm10, %xmm14, %xmm3, %xmm6,
968                            %xmm11, %xmm15, %xmm2, %xmm7,
969                            (%rax), (%r8));
970
971         FRAME_END
972         RET;
973 SYM_FUNC_END(__aria_aesni_avx_crypt_16way)
974
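/*
 * __aria_aesni_avx_crypt_16way selects the tail of the schedule from
 * ARIA_CTX_rounds: 12, 14 or 16 rounds (ARIA-128/-192/-256), always
 * finishing with the aria_ff() final round and its extra round key.
 * C sketch of the dispatch (illustrative only; the helper names are
 * placeholders for the macros above, not kernel symbols):
 *
 *	#include <stdint.h>
 *
 *	void round_odd(void *st, const uint8_t *rk, int r);	// aria_fo
 *	void round_even(void *st, const uint8_t *rk, int r);	// aria_fe
 *	void round_final(void *st, const uint8_t *rk, int r, int last);
 *
 *	static void aria_crypt(void *st, const uint8_t *rk, int rounds)
 *	{
 *		for (int r = 0; r < rounds - 1; r++) {
 *			if (r & 1)
 *				round_even(st, rk, r);
 *			else
 *				round_odd(st, rk, r);
 *		}
 *		round_final(st, rk, rounds - 1, rounds);	// aria_ff
 *	}
 */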
975 SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
976         /* input:
977         *      %rdi: ctx, CTX
978         *      %rsi: dst
979         *      %rdx: src
980         */
981
982         FRAME_BEGIN
983
984         leaq ARIA_CTX_enc_key(CTX), %r9;
985
986         inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
987                      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
988                      %xmm15, %rdx);
989
990         call __aria_aesni_avx_crypt_16way;
991
992         write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
993                      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
994                      %xmm15, %rax);
995
996         FRAME_END
997         RET;
998 SYM_FUNC_END(aria_aesni_avx_encrypt_16way)
999
1000 SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
1001         /* input:
1002         *      %rdi: ctx, CTX
1003         *      %rsi: dst
1004         *      %rdx: src
1005         */
1006
1007         FRAME_BEGIN
1008
1009         leaq ARIA_CTX_dec_key(CTX), %r9;
1010
1011         inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
1012                      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1013                      %xmm15, %rdx);
1014
1015         call __aria_aesni_avx_crypt_16way;
1016
1017         write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
1018                      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
1019                      %xmm15, %rax);
1020
1021         FRAME_END
1022         RET;
1023 SYM_FUNC_END(aria_aesni_avx_decrypt_16way)
1024
1025 SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
1026         /* input:
1027         *      %rdi: ctx
1028         *      %rsi: dst
1029         *      %rdx: src
1030         *      %rcx: keystream
1031         *      %r8: iv (big endian, 128bit)
1032         */
1033
1034         FRAME_BEGIN
1035         /* load IV and byteswap */
1036         vmovdqu (%r8), %xmm8;
1037
1038         vmovdqa .Lbswap128_mask (%rip), %xmm1;
1039         vpshufb %xmm1, %xmm8, %xmm3; /* be => le */
1040
1041         vpcmpeqd %xmm0, %xmm0, %xmm0;
1042         vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */
1043
1044         /* construct IVs */
1045         inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1046         vpshufb %xmm1, %xmm3, %xmm9;
1047         inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1048         vpshufb %xmm1, %xmm3, %xmm10;
1049         inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1050         vpshufb %xmm1, %xmm3, %xmm11;
1051         inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1052         vpshufb %xmm1, %xmm3, %xmm12;
1053         inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1054         vpshufb %xmm1, %xmm3, %xmm13;
1055         inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1056         vpshufb %xmm1, %xmm3, %xmm14;
1057         inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
1058         vpshufb %xmm1, %xmm3, %xmm15;
        vmovdqu %xmm8, (0 * 16)(%rcx);
        vmovdqu %xmm9, (1 * 16)(%rcx);
        vmovdqu %xmm10, (2 * 16)(%rcx);
        vmovdqu %xmm11, (3 * 16)(%rcx);
        vmovdqu %xmm12, (4 * 16)(%rcx);
        vmovdqu %xmm13, (5 * 16)(%rcx);
        vmovdqu %xmm14, (6 * 16)(%rcx);
        vmovdqu %xmm15, (7 * 16)(%rcx);

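        /* construct counter blocks +8..+15 directly in %xmm8..%xmm15 */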
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm8;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm9;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm10;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm11;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm12;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm13;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm14;
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm15;
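        /* counter is now +16: write it back as the IV for the next chunk */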
        inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
        vpshufb %xmm1, %xmm3, %xmm4;
        vmovdqu %xmm4, (%r8);

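        /* reload blocks +0..+7 so all 16 counter blocks sit in %xmm0..%xmm15 */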
        vmovdqu (0 * 16)(%rcx), %xmm0;
        vmovdqu (1 * 16)(%rcx), %xmm1;
        vmovdqu (2 * 16)(%rcx), %xmm2;
        vmovdqu (3 * 16)(%rcx), %xmm3;
        vmovdqu (4 * 16)(%rcx), %xmm4;
        vmovdqu (5 * 16)(%rcx), %xmm5;
        vmovdqu (6 * 16)(%rcx), %xmm6;
        vmovdqu (7 * 16)(%rcx), %xmm7;

        FRAME_END
        RET;
SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
        /* input:
        *      %rdi: ctx
        *      %rsi: dst
        *      %rdx: src
        *      %rcx: keystream
        *      %r8: iv (big endian, 128bit)
        */
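        /*
         * CTR mode: generate 16 counter blocks, encrypt them with the
         * shared core using the encryption key schedule, then XOR the
         * result with the source data and store it to dst.
         */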
        FRAME_BEGIN

        call __aria_aesni_avx_ctr_gen_keystream_16way;

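        /*
         * Preserve the real dst/src in %r10/%r11; the core's dst/src are
         * pointed at the keystream buffer, which it uses as its work area.
         */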
        leaq (%rsi), %r10;
        leaq (%rdx), %r11;
        leaq (%rcx), %rsi;
        leaq (%rcx), %rdx;
        leaq ARIA_CTX_enc_key(CTX), %r9;

        call __aria_aesni_avx_crypt_16way;

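        /* XOR the encrypted counter blocks with the source data in %r11 */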
        vpxor (0 * 16)(%r11), %xmm1, %xmm1;
        vpxor (1 * 16)(%r11), %xmm0, %xmm0;
        vpxor (2 * 16)(%r11), %xmm3, %xmm3;
        vpxor (3 * 16)(%r11), %xmm2, %xmm2;
        vpxor (4 * 16)(%r11), %xmm4, %xmm4;
        vpxor (5 * 16)(%r11), %xmm5, %xmm5;
        vpxor (6 * 16)(%r11), %xmm6, %xmm6;
        vpxor (7 * 16)(%r11), %xmm7, %xmm7;
        vpxor (8 * 16)(%r11), %xmm8, %xmm8;
        vpxor (9 * 16)(%r11), %xmm9, %xmm9;
        vpxor (10 * 16)(%r11), %xmm10, %xmm10;
        vpxor (11 * 16)(%r11), %xmm11, %xmm11;
        vpxor (12 * 16)(%r11), %xmm12, %xmm12;
        vpxor (13 * 16)(%r11), %xmm13, %xmm13;
        vpxor (14 * 16)(%r11), %xmm14, %xmm14;
        vpxor (15 * 16)(%r11), %xmm15, %xmm15;
        write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %r10);

        FRAME_END
        RET;
SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)

#ifdef CONFIG_AS_GFNI
SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
        /* input:
        *      %r9: rk
        *      %rsi: dst
        *      %rdx: src
        *      %xmm0..%xmm15: 16 byte-sliced blocks
        */
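        /*
         * Same structure as __aria_aesni_avx_crypt_16way, but the
         * *_gfni round macros implement the substitution layer with
         * GFNI instructions instead of AES-NI.
         */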

        FRAME_BEGIN

        movq %rsi, %rax;
        leaq 8 * 16(%rax), %r8;

        inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
                      %xmm4, %xmm5, %xmm6, %xmm7,
                      %xmm8, %xmm9, %xmm10, %xmm11,
                      %xmm12, %xmm13, %xmm14,
                      %xmm15, %rax, %r8);
        aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
                     %xmm12, %xmm13, %xmm14, %xmm15,
                     %xmm0, %xmm1, %xmm2, %xmm3,
                     %xmm4, %xmm5, %xmm6, %xmm7,
                     %rax, %r9, 0);
        aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
                     %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11,
                     %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax, %r9, 1);
        aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
                     %xmm12, %xmm13, %xmm14, %xmm15,
                     %xmm0, %xmm1, %xmm2, %xmm3,
                     %xmm4, %xmm5, %xmm6, %xmm7,
                     %rax, %r9, 2);
        aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
                     %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11,
                     %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax, %r9, 3);
        aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
                     %xmm12, %xmm13, %xmm14, %xmm15,
                     %xmm0, %xmm1, %xmm2, %xmm3,
                     %xmm4, %xmm5, %xmm6, %xmm7,
                     %rax, %r9, 4);
        aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
                     %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11,
                     %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax, %r9, 5);
        aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
                     %xmm12, %xmm13, %xmm14, %xmm15,
                     %xmm0, %xmm1, %xmm2, %xmm3,
                     %xmm4, %xmm5, %xmm6, %xmm7,
                     %rax, %r9, 6);
        aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
                     %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11,
                     %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax, %r9, 7);
        aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
                     %xmm12, %xmm13, %xmm14, %xmm15,
                     %xmm0, %xmm1, %xmm2, %xmm3,
                     %xmm4, %xmm5, %xmm6, %xmm7,
                     %rax, %r9, 8);
        aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
                     %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11,
                     %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax, %r9, 9);
        aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
                     %xmm12, %xmm13, %xmm14, %xmm15,
                     %xmm0, %xmm1, %xmm2, %xmm3,
                     %xmm4, %xmm5, %xmm6, %xmm7,
                     %rax, %r9, 10);
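        /*
         * The first 11 rounds (round keys 0..10) are done.  The number
         * of remaining rounds depends on the key size: ARIA-128 runs 12
         * rounds, ARIA-192 runs 14 and ARIA-256 runs 16.
         */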
        cmpl $12, ARIA_CTX_rounds(CTX);
        jne .Laria_gfni_192;
        aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                %xmm15, %rax, %r9, 11, 12);
        jmp .Laria_gfni_end;
.Laria_gfni_192:
        aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
                     %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11,
                     %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax, %r9, 11);
        aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
                     %xmm12, %xmm13, %xmm14, %xmm15,
                     %xmm0, %xmm1, %xmm2, %xmm3,
                     %xmm4, %xmm5, %xmm6, %xmm7,
                     %rax, %r9, 12);
        cmpl $14, ARIA_CTX_rounds(CTX);
        jne .Laria_gfni_256;
        aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
                     %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11,
                     %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax, %r9, 13, 14);
        jmp .Laria_gfni_end;
.Laria_gfni_256:
        aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
                     %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11,
                     %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax, %r9, 13);
        aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
                     %xmm12, %xmm13, %xmm14, %xmm15,
                     %xmm0, %xmm1, %xmm2, %xmm3,
                     %xmm4, %xmm5, %xmm6, %xmm7,
                     %rax, %r9, 14);
        aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
                     %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11,
                     %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax, %r9, 15, 16);
.Laria_gfni_end:
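        /* all rounds done: undo the byte-slicing, as in .Laria_end above */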
        debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
                           %xmm9, %xmm13, %xmm0, %xmm5,
                           %xmm10, %xmm14, %xmm3, %xmm6,
                           %xmm11, %xmm15, %xmm2, %xmm7,
                           (%rax), (%r8));

        FRAME_END
        RET;
SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
        /* input:
        *      %rdi: ctx, CTX
        *      %rsi: dst
        *      %rdx: src
        */
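        /* Identical to aria_aesni_avx_encrypt_16way, but uses the GFNI core. */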

        FRAME_BEGIN

        leaq ARIA_CTX_enc_key(CTX), %r9;

        inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rdx);

        call __aria_aesni_avx_gfni_crypt_16way;

        write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax);

        FRAME_END
        RET;
SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
        /* input:
        *      %rdi: ctx, CTX
        *      %rsi: dst
        *      %rdx: src
        */
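        /* Identical to aria_aesni_avx_decrypt_16way, but uses the GFNI core. */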

        FRAME_BEGIN

        leaq ARIA_CTX_dec_key(CTX), %r9;

        inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rdx);

        call __aria_aesni_avx_gfni_crypt_16way;

        write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %rax);

        FRAME_END
        RET;
SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
        /* input:
        *      %rdi: ctx
        *      %rsi: dst
        *      %rdx: src
        *      %rcx: keystream
        *      %r8: iv (big endian, 128bit)
        */
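        /*
         * Same flow as aria_aesni_avx_ctr_crypt_16way, but the keystream
         * blocks are encrypted with the GFNI core.
         */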
        FRAME_BEGIN

        call __aria_aesni_avx_ctr_gen_keystream_16way;

        leaq (%rsi), %r10;
        leaq (%rdx), %r11;
        leaq (%rcx), %rsi;
        leaq (%rcx), %rdx;
        leaq ARIA_CTX_enc_key(CTX), %r9;

        call __aria_aesni_avx_gfni_crypt_16way;

        vpxor (0 * 16)(%r11), %xmm1, %xmm1;
        vpxor (1 * 16)(%r11), %xmm0, %xmm0;
        vpxor (2 * 16)(%r11), %xmm3, %xmm3;
        vpxor (3 * 16)(%r11), %xmm2, %xmm2;
        vpxor (4 * 16)(%r11), %xmm4, %xmm4;
        vpxor (5 * 16)(%r11), %xmm5, %xmm5;
        vpxor (6 * 16)(%r11), %xmm6, %xmm6;
        vpxor (7 * 16)(%r11), %xmm7, %xmm7;
        vpxor (8 * 16)(%r11), %xmm8, %xmm8;
        vpxor (9 * 16)(%r11), %xmm9, %xmm9;
        vpxor (10 * 16)(%r11), %xmm10, %xmm10;
        vpxor (11 * 16)(%r11), %xmm11, %xmm11;
        vpxor (12 * 16)(%r11), %xmm12, %xmm12;
        vpxor (13 * 16)(%r11), %xmm13, %xmm13;
        vpxor (14 * 16)(%r11), %xmm14, %xmm14;
        vpxor (15 * 16)(%r11), %xmm15, %xmm15;
        write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
                     %xmm15, %r10);

        FRAME_END
        RET;
SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)
#endif /* CONFIG_AS_GFNI */