1 /* blowfish-arm.S - ARM assembly implementation of Blowfish cipher
3 * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
5 * This file is part of Libgcrypt.
7 * Libgcrypt is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as
9 * published by the Free Software Foundation; either version 2.1 of
10 * the License, or (at your option) any later version.
12 * Libgcrypt is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
23 #if defined(__ARMEL__)
24 #ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
31 /* structure of crypto context */
/* Byte offsets of the four 256-entry, 4-byte-wide Blowfish S-boxes and of
 * the 18-word subkey array 'p' inside the context, relative to s0.
 * NOTE(review): the '#define s0 ...' base is not visible in this chunk —
 * confirm it against the full file. */
33 #define s1 (s0 + (1 * 256) * 4)
34 #define s2 (s0 + (2 * 256) * 4)
35 #define s3 (s0 + (3 * 256) * 4)
36 #define p (s3 + (1 * 256) * 4)
/* Load a 32-bit little-endian word from a possibly unaligned address
 * (rsrc + offs) using four single-byte loads.  Result in rout; rtmp is
 * clobbered. */
59 #define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
60 ldrb rout, [rsrc, #((offs) + 0)]; \
61 ldrb rtmp, [rsrc, #((offs) + 1)]; \
62 orr rout, rout, rtmp, lsl #8; \
63 ldrb rtmp, [rsrc, #((offs) + 2)]; \
64 orr rout, rout, rtmp, lsl #16; \
65 ldrb rtmp, [rsrc, #((offs) + 3)]; \
66 orr rout, rout, rtmp, lsl #24;
/* Store the 32-bit word rin to a possibly unaligned address (rdst + offs)
 * in little-endian byte order, one byte at a time.  rtmp0/rtmp1 are
 * clobbered; the shifts are interleaved with the stores for dual-issue. */
68 #define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \
69 mov rtmp0, rin, lsr #8; \
70 strb rin, [rdst, #((offs) + 0)]; \
71 mov rtmp1, rin, lsr #16; \
72 strb rtmp0, [rdst, #((offs) + 1)]; \
73 mov rtmp0, rin, lsr #24; \
74 strb rtmp1, [rdst, #((offs) + 2)]; \
75 strb rtmp0, [rdst, #((offs) + 3)];
/* Load a 32-bit big-endian word from a possibly unaligned address
 * (rsrc + offs) using four single-byte loads (byte 3 is the LSB).
 * Result in rout; rtmp is clobbered. */
77 #define ldr_unaligned_be(rout, rsrc, offs, rtmp) \
78 ldrb rout, [rsrc, #((offs) + 3)]; \
79 ldrb rtmp, [rsrc, #((offs) + 2)]; \
80 orr rout, rout, rtmp, lsl #8; \
81 ldrb rtmp, [rsrc, #((offs) + 1)]; \
82 orr rout, rout, rtmp, lsl #16; \
83 ldrb rtmp, [rsrc, #((offs) + 0)]; \
84 orr rout, rout, rtmp, lsl #24;
/* Store the 32-bit word rin to a possibly unaligned address (rdst + offs)
 * in big-endian byte order (LSB goes to offset+3).  rtmp0/rtmp1 are
 * clobbered. */
86 #define str_unaligned_be(rin, rdst, offs, rtmp0, rtmp1) \
87 mov rtmp0, rin, lsr #8; \
88 strb rin, [rdst, #((offs) + 3)]; \
89 mov rtmp1, rin, lsr #16; \
90 strb rtmp0, [rdst, #((offs) + 2)]; \
91 mov rtmp0, rin, lsr #24; \
92 strb rtmp1, [rdst, #((offs) + 1)]; \
93 strb rtmp0, [rdst, #((offs) + 0)];
/* Endianness helpers.  On a little-endian host the host<->big-endian
 * conversion is a 32-bit byte swap; on a big-endian host it is a no-op.
 * NOTE(review): the #else/#endif lines of the original little/big-endian
 * conditional structure are not visible in this chunk, and the ARMv6
 * 'rev'-based bodies of host_to_be/be_to_host appear to have been lost —
 * verify against the full file before assembling. */
96 #define ldr_unaligned_host ldr_unaligned_le
97 #define str_unaligned_host str_unaligned_le
99 /* bswap on little-endian */
100 #ifdef HAVE_ARM_ARCH_V6
101 #define host_to_be(reg, rtmp) \
103 #define be_to_host(reg, rtmp) \
/* Pre-ARMv6 byte swap: eor/bic/eor sequence swaps all four bytes of reg
 * using one temporary (classic 4-instruction bswap; 65280 == 0xff00). */
106 #define host_to_be(reg, rtmp) \
107 eor rtmp, reg, reg, ror #16; \
108 mov rtmp, rtmp, lsr #8; \
109 bic rtmp, rtmp, #65280; \
110 eor reg, rtmp, reg, ror #8;
111 #define be_to_host(reg, rtmp) \
112 eor rtmp, reg, reg, ror #16; \
113 mov rtmp, rtmp, lsr #8; \
114 bic rtmp, rtmp, #65280; \
115 eor reg, rtmp, reg, ror #8;
/* Big-endian host: byte-wise unaligned access uses the BE loaders and the
 * host<->BE conversions collapse to nothing. */
118 #define ldr_unaligned_host ldr_unaligned_be
119 #define str_unaligned_host str_unaligned_be
121 /* nop on big-endian */
122 #define host_to_be(reg, rtmp) /*_*/
123 #define be_to_host(reg, rtmp) /*_*/
126 #define host_to_host(x, y) /*_*/
128 /***********************************************************************
130 ***********************************************************************/
/* Blowfish F-function fragment (single block): split the four bytes of
 * 'l' into word-scaled S-box indices (RMASK = 0xff << 2 pre-scales each
 * byte to a 4-byte table offset) and load the four S-box entries.
 * NOTE(review): the '#define F(...)' line and the combining add/eor steps
 * are missing from this chunk — this fragment alone is not a complete
 * macro. */
132 and RT0, RMASK, l, lsr#(24 - 2); \
133 and RT1, RMASK, l, lsr#(16 - 2); \
134 ldr RT0, [CTXs0, RT0]; \
135 and RT2, RMASK, l, lsr#(8 - 2); \
136 ldr RT1, [CTXs1, RT1]; \
137 and RT3, RMASK, l, lsl#2; \
138 ldr RT2, [CTXs2, RT2]; \
140 ldr RT3, [CTXs3, RT3]; \
/* Load the encryption-direction round-key pair P[n], P[n+1] into
 * RKEYL/RKEYR.  The P array is addressed relative to CTXs2 (offset
 * p - s2).  NOTE(review): the bodies of add_roundkey_enc() and
 * round_enc() below are partially missing from this chunk. */
145 #define load_roundkey_enc(n) \
146 ldr RKEYL, [CTXs2, #((p - s2) + (4 * (n) + 0))]; \
147 ldr RKEYR, [CTXs2, #((p - s2) + (4 * (n) + 4))];
149 #define add_roundkey_enc() \
153 #define round_enc(n) \
154 add_roundkey_enc(); \
155 load_roundkey_enc(n); \
/* Load the decryption-direction round-key pair P[n], P[n-1] into
 * RKEYL/RKEYR (keys are consumed in reverse order when decrypting).
 * NOTE(review): the bodies of add_roundkey_dec() and round_dec() below
 * are partially missing from this chunk. */
160 #define load_roundkey_dec(n) \
161 ldr RKEYL, [CTXs2, #((p - s2) + (4 * ((n) - 1) + 4))]; \
162 ldr RKEYR, [CTXs2, #((p - s2) + (4 * ((n) - 1) + 0))];
164 #define add_roundkey_dec() \
168 #define round_dec(n) \
169 add_roundkey_dec(); \
170 load_roundkey_dec(n); \
/* Word-aligned 8-byte block load/store.  'convert' is one of host_to_be /
 * be_to_host / host_to_host and 'rtmp' is its scratch register.
 * NOTE(review): the convert(l0, rtmp)/convert(r0, rtmp) invocation lines
 * are missing from this chunk — both macros are incomplete as shown. */
175 #define read_block_aligned(rin, offs, l0, r0, convert, rtmp) \
176 ldr l0, [rin, #((offs) + 0)]; \
177 ldr r0, [rin, #((offs) + 4)]; \
181 #define write_block_aligned(rout, offs, l0, r0, convert, rtmp) \
184 str l0, [rout, #((offs) + 0)]; \
185 str r0, [rout, #((offs) + 4)];
/* When the CPU supports unaligned word access, all four block I/O macros
 * are plain word accesses plus the appropriate endian conversion:
 *   read_block/write_block       — big-endian (wire) order
 *   read_block_host/write_block_host — host order (no conversion) */
187 #ifdef __ARM_FEATURE_UNALIGNED
188 /* unaligned word reads allowed */
189 #define read_block(rin, offs, l0, r0, rtmp0) \
190 read_block_aligned(rin, offs, l0, r0, host_to_be, rtmp0)
192 #define write_block(rout, offs, r0, l0, rtmp0, rtmp1) \
193 write_block_aligned(rout, offs, r0, l0, be_to_host, rtmp0)
195 #define read_block_host(rin, offs, l0, r0, rtmp0) \
196 read_block_aligned(rin, offs, l0, r0, host_to_host, rtmp0)
198 #define write_block_host(rout, offs, r0, l0, rtmp0, rtmp1) \
199 write_block_aligned(rout, offs, r0, l0, host_to_host, rtmp0)
/* Fallback for CPUs without unaligned word access: test alignment at run
 * time and use the byte-wise loaders/storers for unaligned pointers.
 * NOTE(review): the '#else', the tst/beq alignment tests, branch targets
 * and local labels of these macros are missing from this chunk. */
201 /* need to handle unaligned reads by byte reads */
202 #define read_block(rin, offs, l0, r0, rtmp0) \
205 ldr_unaligned_be(l0, rin, (offs) + 0, rtmp0); \
206 ldr_unaligned_be(r0, rin, (offs) + 4, rtmp0); \
209 read_block_aligned(rin, offs, l0, r0, host_to_be, rtmp0); \
212 #define write_block(rout, offs, l0, r0, rtmp0, rtmp1) \
215 str_unaligned_be(l0, rout, (offs) + 0, rtmp0, rtmp1); \
216 str_unaligned_be(r0, rout, (offs) + 4, rtmp0, rtmp1); \
219 write_block_aligned(rout, offs, l0, r0, be_to_host, rtmp0); \
222 #define read_block_host(rin, offs, l0, r0, rtmp0) \
225 ldr_unaligned_host(l0, rin, (offs) + 0, rtmp0); \
226 ldr_unaligned_host(r0, rin, (offs) + 4, rtmp0); \
229 read_block_aligned(rin, offs, l0, r0, host_to_host, rtmp0); \
/* Store the block halves l0/r0 to 'rout + offs' in host byte order:
 * byte stores when the destination is unaligned, word stores otherwise.
 * rtmp0/rtmp1 are clobbered.
 * FIX(review): write_block_aligned takes six arguments
 * (rout, offs, l0, r0, convert, rtmp) — see its definition above — but
 * was invoked here with only five, a preprocessor arity error.  Pass
 * rtmp0 as the scratch register, matching read_block_host above. */
232 #define write_block_host(rout, offs, l0, r0, rtmp0, rtmp1) \
235 str_unaligned_host(l0, rout, (offs) + 0, rtmp0, rtmp1); \
236 str_unaligned_host(r0, rout, (offs) + 4, rtmp0, rtmp1); \
239 write_block_aligned(rout, offs, l0, r0, host_to_host, rtmp0); \
/* Internal helper: encrypt one 64-bit block held in registers (RL0/RR0
 * per the callers below).  Derives the S-box base pointers CTXs1..CTXs3
 * from CTXs0 and primes the first round-key pair.
 * NOTE(review): the entry label, the 16 round_enc invocations and the
 * return sequence are missing from this chunk. */
244 .type __blowfish_enc_blk1,%function;
255 add CTXs1, CTXs0, #(s1 - s0);
256 add CTXs2, CTXs0, #(s2 - s0);
257 mov RMASK, #(0xff << 2); /* byte mask */
258 add CTXs3, CTXs1, #(s3 - s1);
260 load_roundkey_enc(0);
272 .size __blowfish_enc_blk1,.-__blowfish_enc_blk1;
/* Public entry: encrypt one block via __blowfish_enc_blk1.  %r2 is
 * pushed so it survives the call; the push list (11 regs) versus the
 * pop list (10 regs) implies an intermediate pop/store of %r2 that is
 * missing from this chunk — verify against the full file. */
275 .globl _gcry_blowfish_arm_do_encrypt
276 .type _gcry_blowfish_arm_do_encrypt,%function;
278 _gcry_blowfish_arm_do_encrypt:
284 push {%r2, %r4-%r11, %ip, %lr};
289 bl __blowfish_enc_blk1;
295 pop {%r4-%r11, %ip, %pc};
296 .size _gcry_blowfish_arm_do_encrypt,.-_gcry_blowfish_arm_do_encrypt;
/* Encrypt one 64-bit block: read from src (%r2, may be unaligned),
 * encrypt in registers, write to dst (%r1).  The halves are written
 * swapped (RR0, RL0) — the final half-swap of the Blowfish network. */
299 .globl _gcry_blowfish_arm_encrypt_block
300 .type _gcry_blowfish_arm_encrypt_block,%function;
302 _gcry_blowfish_arm_encrypt_block:
308 push {%r4-%r11, %ip, %lr};
310 read_block(%r2, 0, RL0, RR0, RT0);
312 bl __blowfish_enc_blk1;
314 write_block(%r1, 0, RR0, RL0, RT0, RT1);
316 pop {%r4-%r11, %ip, %pc};
317 .size _gcry_blowfish_arm_encrypt_block,.-_gcry_blowfish_arm_encrypt_block;
/* Decrypt one 64-bit block: same structure as encrypt_block but starts
 * from round key 17 and walks the P array backwards.
 * NOTE(review): the 16 round_dec invocations between load_roundkey_dec
 * and write_block are missing from this chunk. */
320 .globl _gcry_blowfish_arm_decrypt_block
321 .type _gcry_blowfish_arm_decrypt_block,%function;
323 _gcry_blowfish_arm_decrypt_block:
329 push {%r4-%r11, %ip, %lr};
331 add CTXs1, CTXs0, #(s1 - s0);
332 add CTXs2, CTXs0, #(s2 - s0);
333 mov RMASK, #(0xff << 2); /* byte mask */
334 add CTXs3, CTXs1, #(s3 - s1);
336 read_block(%r2, 0, RL0, RR0, RT0);
338 load_roundkey_dec(17);
349 write_block(%r1, 0, RR0, RL0, RT0, RT1);
351 pop {%r4-%r11, %ip, %pc};
352 .size _gcry_blowfish_arm_decrypt_block,.-_gcry_blowfish_arm_decrypt_block;
354 /***********************************************************************
356 ***********************************************************************/
/* Two-block interleaved Blowfish F function: processes halves l0/r0 and
 * l1/r1 of two independent blocks, interleaving the S-box index
 * computations and loads to hide load latency.  Only CTXs0 and CTXs2 are
 * kept as live base registers here; the s1 and s3 tables are reached by
 * adding (s1 - s0) / (s3 - s2) to the index register.  'set_nextk' is
 * next_key or dummy; 'dec' flips which of RKEYL/RKEYR gets which offset
 * since decryption consumes the P array backwards.
 * NOTE(review): the combining eor/add lines between the load groups are
 * missing from this chunk — the macro is incomplete as shown. */
357 #define F2(n, l0, r0, l1, r1, set_nextk, dec) \
359 and RT0, RMASK, l0, lsr#(24 - 2); \
360 and RT1, RMASK, l0, lsr#(16 - 2); \
361 and RT2, RMASK, l0, lsr#(8 - 2); \
362 add RT1, #(s1 - s0); \
364 ldr RT0, [CTXs0, RT0]; \
365 and RT3, RMASK, l0, lsl#2; \
366 ldr RT1, [CTXs0, RT1]; \
367 add RT3, #(s3 - s2); \
368 ldr RT2, [CTXs2, RT2]; \
370 ldr RT3, [CTXs2, RT3]; \
372 and RT1, RMASK, l1, lsr#(24 - 2); \
374 and RT2, RMASK, l1, lsr#(16 - 2); \
376 add RT2, #(s1 - s0); \
377 and RT3, RMASK, l1, lsr#(8 - 2); \
380 ldr RT1, [CTXs0, RT1]; \
381 and RT0, RMASK, l1, lsl#2; \
382 ldr RT2, [CTXs0, RT2]; \
383 add RT0, #(s3 - s2); \
384 ldr RT3, [CTXs2, RT3]; \
386 ldr RT0, [CTXs2, RT0]; \
388 and RT2, RMASK, r0, lsr#(24 - 2); \
390 and RT3, RMASK, r0, lsr#(16 - 2); \
392 add RT3, #(s1 - s0); \
393 and RT0, RMASK, r0, lsr#(8 - 2); \
396 ldr RT2, [CTXs0, RT2]; \
397 and RT1, RMASK, r0, lsl#2; \
398 ldr RT3, [CTXs0, RT3]; \
399 add RT1, #(s3 - s2); \
400 ldr RT0, [CTXs2, RT0]; \
402 ldr RT1, [CTXs2, RT1]; \
404 and RT3, RMASK, r1, lsr#(24 - 2); \
406 and RT0, RMASK, r1, lsr#(16 - 2); \
408 add RT0, #(s1 - s0); \
409 and RT1, RMASK, r1, lsr#(8 - 2); \
412 ldr RT3, [CTXs0, RT3]; \
413 and RT2, RMASK, r1, lsl#2; \
414 ldr RT0, [CTXs0, RT0]; \
415 add RT2, #(s3 - s2); \
416 ldr RT1, [CTXs2, RT1]; \
418 ldr RT2, [CTXs2, RT2]; \
426 set_nextk(RKEYL, (p - s2) + (4 * (n) + ((dec) * 4))); \
428 set_nextk(RKEYR, (p - s2) + (4 * (n) + (!(dec) * 4)));
/* Two-block round helpers.  next_key(reg, offs) fetches the next P-array
 * word (used between rounds); dummy() is its no-op stand-in for the last
 * round.  round_enc2/round_dec2 wrap F2 with the appropriate direction
 * flag and key-offset arithmetic.
 * NOTE(review): the key-whitening eor lines inside the
 * load_n_add_roundkey_* macros are missing from this chunk. */
430 #define load_n_add_roundkey_enc2(n) \
431 load_roundkey_enc(n); \
436 load_roundkey_enc((n) + 2);
438 #define next_key(reg, offs) \
439 ldr reg, [CTXs2, #(offs)];
441 #define dummy(x, y) /* do nothing */
443 #define round_enc2(n, load_next_key) \
444 F2((n) + 2, RL0, RR0, RL1, RR1, load_next_key, 0);
446 #define load_n_add_roundkey_dec2(n) \
447 load_roundkey_dec(n); \
452 load_roundkey_dec((n) - 2);
454 #define round_dec2(n, load_next_key) \
455 F2((n) - 3, RL0, RR0, RL1, RR1, load_next_key, 1);
/* Word-aligned load/store of two consecutive 64-bit blocks (16 bytes).
 * NOTE(review): the interleaved convert(..., rtmp) invocations are
 * missing from this chunk — both macros are incomplete as shown. */
457 #define read_block2_aligned(rin, l0, r0, l1, r1, convert, rtmp) \
458 ldr l0, [rin, #(0)]; \
459 ldr r0, [rin, #(4)]; \
461 ldr l1, [rin, #(8)]; \
463 ldr r1, [rin, #(12)]; \
467 #define write_block2_aligned(rout, l0, r0, l1, r1, convert, rtmp) \
471 str l0, [rout, #(0)]; \
473 str r0, [rout, #(4)]; \
474 str l1, [rout, #(8)]; \
475 str r1, [rout, #(12)];
/* Two-block I/O when unaligned word access is available: word accesses
 * plus endian conversion, mirroring the single-block macros above. */
477 #ifdef __ARM_FEATURE_UNALIGNED
478 /* unaligned word reads allowed */
479 #define read_block2(rin, l0, r0, l1, r1, rtmp0) \
480 read_block2_aligned(rin, l0, r0, l1, r1, host_to_be, rtmp0)
482 #define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
483 write_block2_aligned(rout, l0, r0, l1, r1, be_to_host, rtmp0)
485 #define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \
486 read_block2_aligned(rin, l0, r0, l1, r1, host_to_host, rtmp0)
488 #define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
489 write_block2_aligned(rout, l0, r0, l1, r1, host_to_host, rtmp0)
/* Two-block byte-wise fallback: runtime alignment test chooses between
 * byte loads/stores and word accesses, as in the single-block versions.
 * NOTE(review): the '#else', tst/beq alignment tests, branches and local
 * labels are missing from this chunk. */
491 /* need to handle unaligned reads by byte reads */
492 #define read_block2(rin, l0, r0, l1, r1, rtmp0) \
495 ldr_unaligned_be(l0, rin, 0, rtmp0); \
496 ldr_unaligned_be(r0, rin, 4, rtmp0); \
497 ldr_unaligned_be(l1, rin, 8, rtmp0); \
498 ldr_unaligned_be(r1, rin, 12, rtmp0); \
501 read_block2_aligned(rin, l0, r0, l1, r1, host_to_be, rtmp0); \
504 #define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
507 str_unaligned_be(l0, rout, 0, rtmp0, rtmp1); \
508 str_unaligned_be(r0, rout, 4, rtmp0, rtmp1); \
509 str_unaligned_be(l1, rout, 8, rtmp0, rtmp1); \
510 str_unaligned_be(r1, rout, 12, rtmp0, rtmp1); \
513 write_block2_aligned(rout, l0, r0, l1, r1, be_to_host, rtmp0); \
516 #define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \
519 ldr_unaligned_host(l0, rin, 0, rtmp0); \
520 ldr_unaligned_host(r0, rin, 4, rtmp0); \
521 ldr_unaligned_host(l1, rin, 8, rtmp0); \
522 ldr_unaligned_host(r1, rin, 12, rtmp0); \
525 read_block2_aligned(rin, l0, r0, l1, r1, host_to_host, rtmp0); \
528 #define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
531 str_unaligned_host(l0, rout, 0, rtmp0, rtmp1); \
532 str_unaligned_host(r0, rout, 4, rtmp0, rtmp1); \
533 str_unaligned_host(l1, rout, 8, rtmp0, rtmp1); \
534 str_unaligned_host(r1, rout, 12, rtmp0, rtmp1); \
537 write_block2_aligned(rout, l0, r0, l1, r1, host_to_host, rtmp0); \
/* Internal helper: encrypt two blocks held in RL0/RR0 and RL1/RR1 in
 * parallel (16 rounds, keys fetched one round ahead via next_key).  The
 * outputs are converted to big-endian in registers so callers can store
 * them in host order.
 * NOTE(review): the opening comment delimiter, register setup for CTXs0,
 * the final key-mix eors and the return instruction are missing from
 * this chunk. */
542 .type _gcry_blowfish_arm_enc_blk2,%function;
544 _gcry_blowfish_arm_enc_blk2:
547 * [RL0, RR0], [RL1, RR1]: src
549 * [RR0, RL0], [RR1, RL1]: dst
553 add CTXs2, CTXs0, #(s2 - s0);
554 mov RMASK, #(0xff << 2); /* byte mask */
556 load_n_add_roundkey_enc2(0);
557 round_enc2(2, next_key);
558 round_enc2(4, next_key);
559 round_enc2(6, next_key);
560 round_enc2(8, next_key);
561 round_enc2(10, next_key);
562 round_enc2(12, next_key);
563 round_enc2(14, next_key);
564 round_enc2(16, dummy); /* last round: no next key to prefetch */
566 host_to_be(RR0, RT0);
567 host_to_be(RL0, RT0);
568 host_to_be(RR1, RT0);
569 host_to_be(RL1, RT0);
572 .size _gcry_blowfish_arm_enc_blk2,.-_gcry_blowfish_arm_enc_blk2;
/* CFB-mode decryption of two blocks: encrypt IV || src[0] to produce the
 * keystream, then XOR with the ciphertext to recover plaintext and shift
 * the IV forward.
 * NOTE(review): the IV load, the register moves producing %r0/%r1 for
 * the tail, and the XOR instructions between the reads and writes are
 * missing from this chunk. */
575 .globl _gcry_blowfish_arm_cfb_dec;
576 .type _gcry_blowfish_arm_cfb_dec,%function;
578 _gcry_blowfish_arm_cfb_dec:
581 * %r1: dst (2 blocks)
582 * %r2: src (2 blocks)
585 push {%r2, %r4-%r11, %ip, %lr};
589 /* Load input (iv/%r3 is aligned, src/%r2 might not be) */
591 host_to_be(RL0, RT0);
592 host_to_be(RR0, RT0);
593 read_block(%r2, 0, RL1, RR1, RT0);
595 /* Update IV, load src[1] and save to iv[0] */
596 read_block_host(%r2, 8, %r5, %r6, RT0);
599 bl _gcry_blowfish_arm_enc_blk2;
600 /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
602 /* %r1: dst, %r0: %src */
605 /* dst = src ^ result */
606 read_block2_host(%r0, %r5, %r6, %r7, %r8, %lr);
611 write_block2_host(%r1, %r5, %r6, %r7, %r8, %r9, %r10);
613 pop {%r4-%r11, %ip, %pc};
615 .size _gcry_blowfish_arm_cfb_dec,.-_gcry_blowfish_arm_cfb_dec;
/* CTR-mode encryption of two blocks: load the 64-bit big-endian counter,
 * derive counter+1 for the second block and counter+2 as the new IV,
 * encrypt both counters, and XOR the keystream into the plaintext.
 * NOTE(review): the carry-propagation (adc) into the high counter word
 * and the XOR instructions are missing from this chunk. */
618 .globl _gcry_blowfish_arm_ctr_enc;
619 .type _gcry_blowfish_arm_ctr_enc,%function;
621 _gcry_blowfish_arm_ctr_enc:
624 * %r1: dst (2 blocks)
625 * %r2: src (2 blocks)
626 * %r3: iv (64bit, big-endian)
628 push {%r2, %r4-%r11, %ip, %lr};
632 /* Load IV (big => host endian) */
633 read_block_aligned(%lr, 0, RL0, RR0, be_to_host, RT0);
636 adds RR1, RR0, #1; /* +1 */
638 adds %r6, RR1, #1; /* +2 */
641 /* Store new IV (host => big-endian) */
642 write_block_aligned(%lr, 0, %r5, %r6, host_to_be, RT0);
644 bl _gcry_blowfish_arm_enc_blk2;
645 /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
647 /* %r1: dst, %r0: %src */
650 /* XOR key-stream with plaintext */
651 read_block2_host(%r0, %r5, %r6, %r7, %r8, %lr);
656 write_block2_host(%r1, %r5, %r6, %r7, %r8, %r9, %r10);
658 pop {%r4-%r11, %ip, %pc};
660 .size _gcry_blowfish_arm_ctr_enc,.-_gcry_blowfish_arm_ctr_enc;
/* Internal helper: decrypt two blocks held in RL0/RR0 and RL1/RR1 in
 * parallel, consuming the P array from key 17 downwards.  Mirrors
 * enc_blk2 above.
 * NOTE(review): the CTXs0 setup, opening comment delimiter, final
 * key-mix and the return/branch-back sequence are missing from this
 * chunk. */
663 .type _gcry_blowfish_arm_dec_blk2,%function;
665 _gcry_blowfish_arm_dec_blk2:
668 * [RL0, RR0], [RL1, RR1]: src
670 * [RR0, RL0], [RR1, RL1]: dst
672 add CTXs2, CTXs0, #(s2 - s0);
673 mov RMASK, #(0xff << 2); /* byte mask */
675 load_n_add_roundkey_dec2(17);
676 round_dec2(15, next_key);
677 round_dec2(13, next_key);
678 round_dec2(11, next_key);
679 round_dec2(9, next_key);
680 round_dec2(7, next_key);
681 round_dec2(5, next_key);
682 round_dec2(3, next_key);
683 round_dec2(1, dummy); /* last round: no next key to prefetch */
685 host_to_be(RR0, RT0);
686 host_to_be(RL0, RT0);
687 host_to_be(RR1, RT0);
688 host_to_be(RL1, RT0);
692 .size _gcry_blowfish_arm_dec_blk2,.-_gcry_blowfish_arm_dec_blk2;
/* CBC-mode decryption of two blocks: decrypt both ciphertext blocks,
 * then XOR the first result with the IV and the second with ciphertext
 * block 0, finally advancing the IV to ciphertext block 1.  Enters
 * dec_blk2 with a plain branch (not bl) since dec_blk2's return path
 * comes back here.
 * NOTE(review): the return label after the branch, the IV load/store and
 * the XOR instructions are missing from this chunk. */
695 .globl _gcry_blowfish_arm_cbc_dec;
696 .type _gcry_blowfish_arm_cbc_dec,%function;
698 _gcry_blowfish_arm_cbc_dec:
701 * %r1: dst (2 blocks)
702 * %r2: src (2 blocks)
705 push {%r2-%r11, %ip, %lr};
707 read_block2(%r2, RL0, RR0, RL1, RR1, RT0);
709 /* dec_blk2 is only used by cbc_dec, jump directly in/out instead
710 * of function call. */
711 b _gcry_blowfish_arm_dec_blk2;
713 /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
715 /* %r0: %src, %r1: dst, %r2: iv */
718 /* load IV+1 (src[0]) to %r7:%r8. Might be unaligned. */
719 read_block_host(%r0, 0, %r7, %r8, %r5);
720 /* load IV (iv[0]) to %r5:%r6. 'iv' is aligned. */
730 /* load IV+2 (src[1]) to %r7:%r8. Might be unaligned. */
731 read_block_host(%r0, 8, %r7, %r8, %r5);
732 /* store IV+2 to iv[0] (aligned). */
735 /* store result to dst[0-3]. Might be unaligned. */
736 write_block2_host(%r1, %r4, %r3, %r10, %r9, %r5, %r6);
738 pop {%r4-%r11, %ip, %pc};
740 .size _gcry_blowfish_arm_cbc_dec,.-_gcry_blowfish_arm_cbc_dec;
742 #endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/
743 #endif /*__ARMEL__*/