/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ARIA Cipher 16-way parallel algorithm (AVX)
 *
 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
 */
#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/asm-offsets.h>
#include <asm/frame.h>
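
/* Helpers for building the 8-bit row vectors and 8x8 bit matrices used by
 * the GFNI affine instructions below, e.g. BV8(1, 0, 0, 0, 0, 0, 0, 0) is
 * 0x01, and eight such rows packed by BM8X8() form one 64-bit matrix. */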
#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)	\
	( (((a0) & 1) << 0) |			\
#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)	\
	( ((l7) << (0 * 8)) |			\
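
/* Add one to the 128-bit little-endian value in x, propagating the carry
 * from the low qword into the high qword; minus_one must hold { -1, 0 }
 * (low qword all-ones, high qword zero) and tmp is clobbered. */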
#define inc_le128(x, minus_one, tmp)	\
	vpcmpeqq minus_one, x, tmp;	\
	vpsubq minus_one, x, x;		\
	vpslldq $8, tmp, tmp;		\
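
/* Map every byte of x through an 8-bit lookup implemented as two 16-entry
 * vpshufb tables: lo_t handles the low nibble, hi_t the high nibble, and
 * the two results are combined. mask4bit must hold 0x0f in every byte. */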
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)	\
	vpand x, mask4bit, tmp0;			\
	vpandn x, mask4bit, x;				\
	vpshufb tmp0, lo_t, tmp0;			\
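
/* Transpose a 4x4 matrix of 32-bit words held in x0..x3; t1 and t2 are
 * clobbered as scratch. */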
#define transpose_4x4(x0, x1, x2, x3, t1, t2)	\
	vpunpckhdq x1, x0, t2;			\
	vpunpckldq x1, x0, x0;			\
	vpunpckldq x3, x2, t1;			\
	vpunpckhdq x3, x2, x2;			\
	vpunpckhqdq t1, x0, x1;			\
	vpunpcklqdq t1, x0, x0;			\
	vpunpckhqdq x2, t2, x3;			\
	vpunpcklqdq x2, t2, x2;
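
/* Byte-slice 16 blocks: afterwards each of the 16 registers holds the same
 * byte position of all 16 blocks, so the S-layer can process one byte of
 * every block in parallel. */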
#define byteslice_16x16b(a0, b0, c0, d0,		\
	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
	vmovdqu .Lshufb_16x16b(%rip), a0;		\
	vpshufb a0, d3, a0;				\
	transpose_4x4(a0, b0, c0, d0, d2, d3);		\
	transpose_4x4(a1, b1, c1, d1, d2, d3);		\
	transpose_4x4(a2, b2, c2, d2, b0, b1);		\
	transpose_4x4(a3, b3, c3, d3, b0, b1);		\
	/* does not adjust output bytes inside vectors */
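
/* Inverse of byteslice_16x16b: regroup the byte-sliced registers back into
 * 16 contiguous output blocks. */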
#define debyteslice_16x16b(a0, b0, c0, d0,		\
	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
	vmovdqu .Lshufb_16x16b(%rip), a0;		\
	vpshufb a0, a2, a2;				\
	vpshufb a0, a3, a3;				\
	vpshufb a0, b0, b0;				\
	vpshufb a0, b1, b1;				\
	vpshufb a0, b2, b2;				\
	vpshufb a0, b3, b3;				\
	vpshufb a0, a1, a1;				\
	vpshufb a0, c0, c0;				\
	vpshufb a0, c1, c1;				\
	vpshufb a0, c2, c2;				\
	vpshufb a0, c3, c3;				\
	vpshufb a0, d0, d0;				\
	vpshufb a0, d1, d1;				\
	vpshufb a0, d2, d2;				\
	vpshufb a0, d3, d3;				\
	vpshufb a0, d3, a0;				\
	transpose_4x4(c0, d0, a0, b0, d2, d3);		\
	transpose_4x4(c1, d1, a1, b1, d2, d3);		\
	transpose_4x4(c2, d2, a2, b2, b0, b1);		\
	transpose_4x4(c3, d3, a3, b3, b0, b1);		\
	/* does not adjust output bytes inside vectors */
/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3,	\
	vmovdqu (0 * 16)(rio), x0;	\
	vmovdqu (1 * 16)(rio), x1;	\
	vmovdqu (2 * 16)(rio), x2;	\
	vmovdqu (3 * 16)(rio), x3;	\
	vmovdqu (4 * 16)(rio), x4;	\
	vmovdqu (5 * 16)(rio), x5;	\
	vmovdqu (6 * 16)(rio), x6;	\
	vmovdqu (7 * 16)(rio), x7;	\
	vmovdqu (8 * 16)(rio), y0;	\
	vmovdqu (9 * 16)(rio), y1;	\
	vmovdqu (10 * 16)(rio), y2;	\
	vmovdqu (11 * 16)(rio), y3;	\
	vmovdqu (12 * 16)(rio), y4;	\
	vmovdqu (13 * 16)(rio), y5;	\
	vmovdqu (14 * 16)(rio), y6;	\
	vmovdqu (15 * 16)(rio), y7;
/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3,		\
	byteslice_16x16b(x0, x1, x2, x3,	\
			 (mem_ab), (mem_cd));	\
	vmovdqu x0, 0 * 16(mem_ab);	\
	vmovdqu x1, 1 * 16(mem_ab);	\
	vmovdqu x2, 2 * 16(mem_ab);	\
	vmovdqu x3, 3 * 16(mem_ab);	\
	vmovdqu x4, 4 * 16(mem_ab);	\
	vmovdqu x5, 5 * 16(mem_ab);	\
	vmovdqu x6, 6 * 16(mem_ab);	\
	vmovdqu x7, 7 * 16(mem_ab);	\
	vmovdqu y0, 0 * 16(mem_cd);	\
	vmovdqu y1, 1 * 16(mem_cd);	\
	vmovdqu y2, 2 * 16(mem_cd);	\
	vmovdqu y3, 3 * 16(mem_cd);	\
	vmovdqu y4, 4 * 16(mem_cd);	\
	vmovdqu y5, 5 * 16(mem_cd);	\
	vmovdqu y6, 6 * 16(mem_cd);	\
	vmovdqu y7, 7 * 16(mem_cd);
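
/* Store the 16 output blocks to contiguous memory at mem. */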
#define write_output(x0, x1, x2, x3,	\
	vmovdqu x0, 0 * 16(mem);	\
	vmovdqu x1, 1 * 16(mem);	\
	vmovdqu x2, 2 * 16(mem);	\
	vmovdqu x3, 3 * 16(mem);	\
	vmovdqu x4, 4 * 16(mem);	\
	vmovdqu x5, 5 * 16(mem);	\
	vmovdqu x6, 6 * 16(mem);	\
	vmovdqu x7, 7 * 16(mem);	\
	vmovdqu y0, 8 * 16(mem);	\
	vmovdqu y1, 9 * 16(mem);	\
	vmovdqu y2, 10 * 16(mem);	\
	vmovdqu y3, 11 * 16(mem);	\
	vmovdqu y4, 12 * 16(mem);	\
	vmovdqu y5, 13 * 16(mem);	\
	vmovdqu y6, 14 * 16(mem);	\
	vmovdqu y7, 15 * 16(mem);	\
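
/* Spill eight state registers to the temporary area at (idx * 16)(mem_tmp);
 * aria_load_state_8way below reloads them. */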
#define aria_store_state_8way(x0, x1, x2, x3,	\
	vmovdqu x0, ((idx + 0) * 16)(mem_tmp);	\
	vmovdqu x1, ((idx + 1) * 16)(mem_tmp);	\
	vmovdqu x2, ((idx + 2) * 16)(mem_tmp);	\
	vmovdqu x3, ((idx + 3) * 16)(mem_tmp);	\
	vmovdqu x4, ((idx + 4) * 16)(mem_tmp);	\
	vmovdqu x5, ((idx + 5) * 16)(mem_tmp);	\
	vmovdqu x6, ((idx + 6) * 16)(mem_tmp);	\
	vmovdqu x7, ((idx + 7) * 16)(mem_tmp);

#define aria_load_state_8way(x0, x1, x2, x3,	\
	vmovdqu ((idx + 0) * 16)(mem_tmp), x0;	\
	vmovdqu ((idx + 1) * 16)(mem_tmp), x1;	\
	vmovdqu ((idx + 2) * 16)(mem_tmp), x2;	\
	vmovdqu ((idx + 3) * 16)(mem_tmp), x3;	\
	vmovdqu ((idx + 4) * 16)(mem_tmp), x4;	\
	vmovdqu ((idx + 5) * 16)(mem_tmp), x5;	\
	vmovdqu ((idx + 6) * 16)(mem_tmp), x6;	\
	vmovdqu ((idx + 7) * 16)(mem_tmp), x7;
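
/* AddRoundKey for byte-sliced state: broadcast one 32-bit round-key word,
 * replicate each of its bytes across a register with vpshufb (t1 is expected
 * to be all-zero so that byte 0 is broadcast) and XOR it into the matching
 * state register. */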
#define aria_ark_8way(x0, x1, x2, x3,			\
	vbroadcastss ((round * 16) + idx + 0)(rk), t0;	\
	vpsrld $24, t0, t2;				\
	vpshufb t1, t2, t2;				\
	vpsrld $16, t0, t2;				\
	vpshufb t1, t2, t2;				\
	vpshufb t1, t2, t2;				\
	vpshufb t1, t0, t2;				\
	vbroadcastss ((round * 16) + idx + 4)(rk), t0;	\
	vpsrld $24, t0, t2;				\
	vpshufb t1, t2, t2;				\
	vpsrld $16, t0, t2;				\
	vpshufb t1, t2, t2;				\
	vpshufb t1, t2, t2;				\
	vpshufb t1, t0, t2;				\
#ifdef CONFIG_AS_GFNI
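/* ARIA substitution layer using GFNI: the four S-boxes (S1, S2 and their
 * inverses) are applied with vgf2p8affineqb/vgf2p8affineinvqb using the bit
 * matrices and constants defined in the rodata section below. */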
#define aria_sbox_8way_gfni(x0, x1, x2, x3,		\
	vmovdqa .Ltf_s2_bitmatrix(%rip), t0;		\
	vmovdqa .Ltf_inv_bitmatrix(%rip), t1;		\
	vmovdqa .Ltf_id_bitmatrix(%rip), t2;		\
	vmovdqa .Ltf_aff_bitmatrix(%rip), t3;		\
	vmovdqa .Ltf_x2_bitmatrix(%rip), t4;		\
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;	\
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;	\
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2;	\
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6;	\
	vgf2p8affineinvqb $0, t2, x2, x2;		\
	vgf2p8affineinvqb $0, t2, x6, x6;		\
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;	\
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;	\
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3;	\
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7;	\
	vgf2p8affineinvqb $0, t2, x3, x3;		\
	vgf2p8affineinvqb $0, t2, x7, x7

#endif /* CONFIG_AS_GFNI */
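
/* AES-NI based substitution layer: AESENCLAST/AESDECLAST with a zero round
 * key in t7 provide the AES S-box and its inverse, the vpshufb with
 * .Linv_shift_row/.Lshift_row undoes the ShiftRows step baked into those
 * instructions, and filter_8bit applies the extra affine maps that turn the
 * AES S-boxes into ARIA's S2 and X2. */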
#define aria_sbox_8way(x0, x1, x2, x3,			\
	vmovdqa .Linv_shift_row(%rip), t0;		\
	vmovdqa .Lshift_row(%rip), t1;			\
	vbroadcastss .L0f0f0f0f(%rip), t6;		\
	vmovdqa .Ltf_lo__inv_aff__and__s2(%rip), t2;	\
	vmovdqa .Ltf_hi__inv_aff__and__s2(%rip), t3;	\
	vmovdqa .Ltf_lo__x2__and__fwd_aff(%rip), t4;	\
	vmovdqa .Ltf_hi__x2__and__fwd_aff(%rip), t5;	\
	vaesenclast t7, x0, x0;				\
	vaesenclast t7, x4, x4;				\
	vaesenclast t7, x1, x1;				\
	vaesenclast t7, x5, x5;				\
	vaesdeclast t7, x2, x2;				\
	vaesdeclast t7, x6, x6;				\
	/* AES inverse shift rows */			\
	vpshufb t0, x0, x0;				\
	vpshufb t0, x4, x4;				\
	vpshufb t0, x1, x1;				\
	vpshufb t0, x5, x5;				\
	vpshufb t1, x3, x3;				\
	vpshufb t1, x7, x7;				\
	vpshufb t1, x2, x2;				\
	vpshufb t1, x6, x6;				\
	/* affine transformation for S2 */		\
	filter_8bit(x1, t2, t3, t6, t0);		\
	/* affine transformation for S2 */		\
	filter_8bit(x5, t2, t3, t6, t0);		\
	/* affine transformation for X2 */		\
	filter_8bit(x3, t4, t5, t6, t0);		\
	/* affine transformation for X2 */		\
	filter_8bit(x7, t4, t5, t6, t0);		\
	vaesdeclast t7, x3, x3;				\
	vaesdeclast t7, x7, x7;
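
/* Byte-rotation part of ARIA's diffusion layer, applied to each 32-bit word
 * of the byte-sliced state (see the rotr32 comments below). */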
#define aria_diff_m(x0, x1, x2, x3,	\
	/* T = rotr32(X, 8); */		\
	/* X = T ^ rotr(X, 16); */	\
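
/* Word-level part of the diffusion layer: XOR the sixteen byte-sliced
 * registers together according to ARIA's binary diffusion matrix. */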
#define aria_diff_word(x0, x1, x2, x3,	\
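
/* ARIA even-round function FE: AddRoundKey, substitution layer SL2, then
 * the diffusion layer; the 16 byte-sliced blocks are processed as two
 * 8-register halves staged through mem_tmp. */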
#define aria_fe(x0, x1, x2, x3,				\
		mem_tmp, rk, round)			\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 8, round);	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
	aria_load_state_8way(x0, x1, x2, x3,		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 0, round);	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
	aria_load_state_8way(y0, y1, y2, y3,		\
	aria_diff_word(x0, x1, x2, x3,			\
	/* aria_diff_byte()				\
	 * T3 = ABCD -> BADC				\
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
	 * T0 = ABCD -> CDAB				\
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
	 * T1 = ABCD -> DCBA				\
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
	 */						\
	aria_diff_word(x2, x3, x0, x1,			\
	aria_store_state_8way(x3, x2, x1, x0,		\
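
/* ARIA odd-round function FO: AddRoundKey, substitution layer SL1, then the
 * diffusion layer. */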
#define aria_fo(x0, x1, x2, x3,				\
		mem_tmp, rk, round)			\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 8, round);	\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
	aria_load_state_8way(x0, x1, x2, x3,		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 0, round);	\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
	aria_load_state_8way(y0, y1, y2, y3,		\
	aria_diff_word(x0, x1, x2, x3,			\
	/* aria_diff_byte()				\
	 * T1 = ABCD -> BADC				\
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
	 * T2 = ABCD -> CDAB				\
	 * T2 = y0, y1, y2, y3 -> y2, y3, y0, y1	\
	 * T3 = ABCD -> DCBA				\
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
	 */						\
	aria_diff_word(x0, x1, x2, x3,			\
	aria_store_state_8way(x3, x2, x1, x0,		\
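
/* ARIA final round FF: AddRoundKey, substitution layer, then the last-round
 * key addition instead of the diffusion layer. */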
#define aria_ff(x0, x1, x2, x3,				\
		mem_tmp, rk, round, last_round)		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 8, round);	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 8, last_round);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
	aria_load_state_8way(x0, x1, x2, x3,		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 0, round);	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 0, last_round);	\
	aria_load_state_8way(y0, y1, y2, y3,		\
#ifdef CONFIG_AS_GFNI
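/* GFNI variants of the round helpers above: same structure as
 * aria_fe/fo/ff, but the substitution layer is aria_sbox_8way_gfni. */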
#define aria_fe_gfni(x0, x1, x2, x3,			\
		     mem_tmp, rk, round)		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 8, round);	\
	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
	aria_load_state_8way(x0, x1, x2, x3,		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 0, round);	\
	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
	aria_load_state_8way(y0, y1, y2, y3,		\
	aria_diff_word(x0, x1, x2, x3,			\
	/* aria_diff_byte()				\
	 * T3 = ABCD -> BADC				\
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
	 * T0 = ABCD -> CDAB				\
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
	 * T1 = ABCD -> DCBA				\
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
	 */						\
	aria_diff_word(x2, x3, x0, x1,			\
	aria_store_state_8way(x3, x2, x1, x0,		\
#define aria_fo_gfni(x0, x1, x2, x3,			\
		     mem_tmp, rk, round)		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 8, round);	\
	aria_sbox_8way_gfni(x0, x1, x2, x3,		\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
	aria_load_state_8way(x0, x1, x2, x3,		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 0, round);	\
	aria_sbox_8way_gfni(x0, x1, x2, x3,		\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
	aria_load_state_8way(y0, y1, y2, y3,		\
	aria_diff_word(x0, x1, x2, x3,			\
	/* aria_diff_byte()				\
	 * T1 = ABCD -> BADC				\
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
	 * T2 = ABCD -> CDAB				\
	 * T2 = y0, y1, y2, y3 -> y2, y3, y0, y1	\
	 * T3 = ABCD -> DCBA				\
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
	 */						\
	aria_diff_word(x0, x1, x2, x3,			\
	aria_store_state_8way(x3, x2, x1, x0,		\
#define aria_ff_gfni(x0, x1, x2, x3,			\
		     mem_tmp, rk, round, last_round)	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 8, round);	\
	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 8, last_round);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
	aria_load_state_8way(x0, x1, x2, x3,		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 0, round);	\
	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 0, last_round);	\
	aria_load_state_8way(y0, y1, y2, y3,		\

#endif /* CONFIG_AS_GFNI */
/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section	.rodata.cst16, "aM", @progbits, 16

#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
/* For isolating SubBytes from AESENCLAST, inverse shift row */
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
	.byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
	.byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
/* For CTR-mode IV byteswap */
	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
/* AES inverse affine and S2 combined:
 *      1 1 0 0 0 0 0 1     x0     0
 *      0 1 0 0 1 0 0 0     x1     0
 *      1 1 0 0 1 1 1 1     x2     0
 *      0 1 1 0 1 0 0 1     x3     1
 *      0 1 0 0 1 1 0 0  *  x4  +  0
 *      0 1 0 1 1 0 0 0     x5     0
 *      0 0 0 0 0 1 0 1     x6     0
 *      1 1 1 0 0 1 1 1     x7     1
 */
.Ltf_lo__inv_aff__and__s2:
	.octa 0x92172DA81A9FA520B2370D883ABF8500
.Ltf_hi__inv_aff__and__s2:
	.octa 0x2B15FFC1AF917B45E6D8320C625CB688
/* X2 and AES forward affine combined:
 *      1 0 1 1 0 0 0 1     x0     0
 *      0 1 1 1 1 0 1 1     x1     0
 *      0 0 0 1 1 0 1 0     x2     1
 *      0 1 0 0 0 1 0 0     x3     0
 *      0 0 1 1 1 0 1 1  *  x4  +  0
 *      0 1 0 0 1 0 0 0     x5     0
 *      1 1 0 1 0 0 1 1     x6     0
 *      0 1 0 0 1 0 1 0     x7     0
 */
.Ltf_lo__x2__and__fwd_aff:
	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
.Ltf_hi__x2__and__fwd_aff:
	.octa 0x3F893781E95FE1576CDA64D2BA0CB204
#ifdef CONFIG_AS_GFNI
.section	.rodata.cst8, "aM", @progbits, 8

#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
/* Identity matrix: */
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
#endif /* CONFIG_AS_GFNI */
.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
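
/* Core 16-way ARIA transform on byte-sliced state; runs the 12-, 14- or
 * 16-round schedule depending on ARIA_CTX_rounds (i.e. key length). */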
SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
	/* input:
	 *	%xmm0..%xmm15: 16 byte-sliced blocks
	 */
	leaq 8 * 16(%rax), %r8;
	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 1);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 3);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 5);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 7);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 9);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	cmpl $12, ARIA_CTX_rounds(CTX);
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 11, 12);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 11);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	cmpl $14, ARIA_CTX_rounds(CTX);
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 13, 14);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 13);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 15, 16);
	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
			   %xmm9, %xmm13, %xmm0, %xmm5,
			   %xmm10, %xmm14, %xmm3, %xmm6,
			   %xmm11, %xmm15, %xmm2, %xmm7,
SYM_FUNC_END(__aria_aesni_avx_crypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,

	call __aria_aesni_avx_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
SYM_FUNC_END(aria_aesni_avx_encrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,

	call __aria_aesni_avx_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
SYM_FUNC_END(aria_aesni_avx_decrypt_16way)
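
/* Generate 16 CTR keystream blocks: byteswap the big-endian IV, increment
 * it once per block, store the counters to the buffer at (%rcx) and write
 * the next IV back to (%r8). */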
SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
	/* input:
	 *	%r8: iv (big endian, 128bit)
	 */

	/* load IV and byteswap */
	vmovdqu (%r8), %xmm8;

	vmovdqa .Lbswap128_mask(%rip), %xmm1;
	vpshufb %xmm1, %xmm8, %xmm3; /* be => le */

	vpcmpeqd %xmm0, %xmm0, %xmm0;
	vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */

	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm9;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm10;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm11;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm12;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm13;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm14;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm15;
	vmovdqu %xmm8, (0 * 16)(%rcx);
	vmovdqu %xmm9, (1 * 16)(%rcx);
	vmovdqu %xmm10, (2 * 16)(%rcx);
	vmovdqu %xmm11, (3 * 16)(%rcx);
	vmovdqu %xmm12, (4 * 16)(%rcx);
	vmovdqu %xmm13, (5 * 16)(%rcx);
	vmovdqu %xmm14, (6 * 16)(%rcx);
	vmovdqu %xmm15, (7 * 16)(%rcx);

	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm8;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm9;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm10;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm11;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm12;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm13;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm14;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm15;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm4;
	vmovdqu %xmm4, (%r8);

	vmovdqu (0 * 16)(%rcx), %xmm0;
	vmovdqu (1 * 16)(%rcx), %xmm1;
	vmovdqu (2 * 16)(%rcx), %xmm2;
	vmovdqu (3 * 16)(%rcx), %xmm3;
	vmovdqu (4 * 16)(%rcx), %xmm4;
	vmovdqu (5 * 16)(%rcx), %xmm5;
	vmovdqu (6 * 16)(%rcx), %xmm6;
	vmovdqu (7 * 16)(%rcx), %xmm7;
SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)
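
/* CTR mode: generate 16 counter blocks, encrypt them with ARIA, then XOR
 * the keystream with the source data. */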
SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
	/* input:
	 *	%r8: iv (big endian, 128bit)
	 */

	call __aria_aesni_avx_ctr_gen_keystream_16way;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx_crypt_16way;

	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)

#ifdef CONFIG_AS_GFNI
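/* GFNI-accelerated variants of the crypt core and of the encrypt, decrypt
 * and CTR entry points. */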
SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
	/* input:
	 *	%xmm0..%xmm15: 16 byte-sliced blocks
	 */
	leaq 8 * 16(%rax), %r8;
	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
		      %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11,
		      %xmm12, %xmm13, %xmm14,
	aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 1);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 3);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 5);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 7);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 9);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_192;
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 11, 12);
	jmp .Laria_gfni_end;
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 11);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_256;
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 13, 14);
	jmp .Laria_gfni_end;
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 13);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 15, 16);
	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
			   %xmm9, %xmm13, %xmm0, %xmm5,
			   %xmm10, %xmm14, %xmm3, %xmm6,
			   %xmm11, %xmm15, %xmm2, %xmm7,
SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,

	call __aria_aesni_avx_gfni_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,

	call __aria_aesni_avx_gfni_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
	/* input:
	 *	%r8: iv (big endian, 128bit)
	 */

	call __aria_aesni_avx_ctr_gen_keystream_16way;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx_gfni_crypt_16way;

	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)
#endif /* CONFIG_AS_GFNI */